feat: improve web_fetch readability extraction
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
- CLI/macOS: sync remote SSH target/identity to config and let `gateway status` auto-infer SSH targets (ssh-config aware).
|
||||
- Telegram: scope inline buttons with allowlist default + callback gating in DMs/groups.
|
||||
- Telegram: default reaction notifications to own.
|
||||
- Tools: improve `web_fetch` extraction using Readability (with fallback).
|
||||
- Heartbeat: tighten prompt guidance + suppress duplicate alerts for 24h. (#980) — thanks @voidserf.
|
||||
- Repo: ignore local identity files to avoid accidental commits. (#1001) — thanks @gerardward2007.
|
||||
- Sessions/Security: add `session.dmScope` for multi-user DM isolation and audit warnings. (#948) — thanks @Alphonse-arianee.
|
||||
@@ -95,7 +96,7 @@
|
||||
- macOS: resolve gateway token/password using config mode/remote URL, and warn when `launchctl setenv` overrides config. (#1022, #1021) — thanks @kkarimi.
|
||||
- Telegram: allow reply-chain messages to bypass mention gating in groups. (#1038) — thanks @adityashaw2.
|
||||
- Groups: treat replies to the bot as implicit mentions across supported channels.
|
||||
- Security: bump `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97.
|
||||
- Security: bump dependency `tar` to 7.5.3 to fix GHSA-8qq5-rm4j-mr97.
|
||||
|
||||
## 2026.1.14-1
|
||||
|
||||
|
||||
@@ -1709,11 +1709,12 @@ Legacy: `tools.bash` is still accepted as an alias.
|
||||
- `tools.web.search.maxResults` (1–10, default 5)
|
||||
- `tools.web.search.timeoutSeconds` (default 30)
|
||||
- `tools.web.search.cacheTtlMinutes` (default 15)
|
||||
- `tools.web.fetch.enabled` (default false; sandboxed sessions auto-enable unless set to false)
|
||||
- `tools.web.fetch.enabled` (default true)
|
||||
- `tools.web.fetch.maxChars` (default 50000)
|
||||
- `tools.web.fetch.timeoutSeconds` (default 30)
|
||||
- `tools.web.fetch.cacheTtlMinutes` (default 15)
|
||||
- `tools.web.fetch.userAgent` (optional override)
|
||||
- `tools.web.fetch.readability` (default true; disable to use basic HTML cleanup only)
|
||||
|
||||
`agents.defaults.subagents` configures sub-agent defaults:
|
||||
- `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call.
|
||||
|
||||
@@ -116,7 +116,8 @@ Fetch a URL and extract readable content.
|
||||
maxChars: 50000,
|
||||
timeoutSeconds: 30,
|
||||
cacheTtlMinutes: 15,
|
||||
userAgent: "clawdbot/2026.1.15"
|
||||
userAgent: "clawdbot/2026.1.15",
|
||||
readability: true
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -130,7 +131,8 @@ Fetch a URL and extract readable content.
|
||||
- `maxChars` (truncate long pages)
|
||||
|
||||
Notes:
|
||||
- `web_fetch` uses Readability (main-content extraction) by default and falls back to basic HTML cleanup if it fails.
|
||||
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
|
||||
- Responses are cached (default 15 minutes) to reduce repeated fetches.
|
||||
- If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`.
|
||||
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.
|
||||
- If the Brave key is missing, `web_search` returns a short setup hint with a docs link.
|
||||
|
||||
@@ -142,6 +142,7 @@
|
||||
"@mariozechner/pi-ai": "0.46.0",
|
||||
"@mariozechner/pi-coding-agent": "^0.46.0",
|
||||
"@mariozechner/pi-tui": "^0.46.0",
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"@sinclair/typebox": "0.34.47",
|
||||
"@slack/bolt": "^4.6.0",
|
||||
"@slack/web-api": "^7.13.0",
|
||||
@@ -162,6 +163,7 @@
|
||||
"hono": "4.11.4",
|
||||
"jiti": "^2.6.1",
|
||||
"json5": "^2.2.3",
|
||||
"linkedom": "^0.18.12",
|
||||
"long": "5.3.2",
|
||||
"markdown-it": "^14.1.0",
|
||||
"osc-progress": "^0.2.0",
|
||||
|
||||
120
pnpm-lock.yaml
generated
120
pnpm-lock.yaml
generated
@@ -40,6 +40,9 @@ importers:
|
||||
'@mariozechner/pi-tui':
|
||||
specifier: ^0.46.0
|
||||
version: 0.46.0
|
||||
'@mozilla/readability':
|
||||
specifier: ^0.6.0
|
||||
version: 0.6.0
|
||||
'@sinclair/typebox':
|
||||
specifier: 0.34.47
|
||||
version: 0.34.47
|
||||
@@ -100,6 +103,9 @@ importers:
|
||||
json5:
|
||||
specifier: ^2.2.3
|
||||
version: 2.2.3
|
||||
linkedom:
|
||||
specifier: ^0.18.12
|
||||
version: 0.18.12
|
||||
long:
|
||||
specifier: 5.3.2
|
||||
version: 5.3.2
|
||||
@@ -1036,6 +1042,10 @@ packages:
|
||||
'@mistralai/mistralai@1.10.0':
|
||||
resolution: {integrity: sha512-tdIgWs4Le8vpvPiUEWne6tK0qbVc+jMenujnvTqOjogrJUsCSQhus0tHTU1avDDh5//Rq2dFgP9mWRAdIEoBqg==}
|
||||
|
||||
'@mozilla/readability@0.6.0':
|
||||
resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==}
|
||||
engines: {node: '>=14.0.0'}
|
||||
|
||||
'@napi-rs/wasm-runtime@1.1.1':
|
||||
resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==}
|
||||
|
||||
@@ -2221,6 +2231,9 @@ packages:
|
||||
resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
boolbase@1.0.0:
|
||||
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
|
||||
|
||||
bottleneck@2.19.5:
|
||||
resolution: {integrity: sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw==}
|
||||
|
||||
@@ -2405,6 +2418,16 @@ packages:
|
||||
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
|
||||
engines: {node: '>= 8'}
|
||||
|
||||
css-select@5.2.2:
|
||||
resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==}
|
||||
|
||||
css-what@6.2.2:
|
||||
resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==}
|
||||
engines: {node: '>= 6'}
|
||||
|
||||
cssom@0.5.0:
|
||||
resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==}
|
||||
|
||||
curve25519-js@0.0.4:
|
||||
resolution: {integrity: sha512-axn2UMEnkhyDUPWOwVKBMVIzSQy2ejH2xRGy1wq81dqRwApXfIzfbE3hIX0ZRFBIihf/KDqK158DLwESu4AK1w==}
|
||||
|
||||
@@ -2456,9 +2479,22 @@ packages:
|
||||
docx-preview@0.3.7:
|
||||
resolution: {integrity: sha512-Lav69CTA/IYZPJTsKH7oYeoZjyg96N0wEJMNslGJnZJ+dMUZK85Lt5ASC79yUlD48ecWjuv+rkcmFt6EVPV0Xg==}
|
||||
|
||||
dom-serializer@2.0.0:
|
||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||
|
||||
domelementtype@2.3.0:
|
||||
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
|
||||
|
||||
domhandler@5.0.3:
|
||||
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
|
||||
engines: {node: '>= 4'}
|
||||
|
||||
dompurify@3.3.1:
|
||||
resolution: {integrity: sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==}
|
||||
|
||||
domutils@3.2.2:
|
||||
resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==}
|
||||
|
||||
dotenv@17.2.3:
|
||||
resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==}
|
||||
engines: {node: '>=12'}
|
||||
@@ -2493,6 +2529,10 @@ packages:
|
||||
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
entities@6.0.1:
|
||||
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
env-var@7.5.0:
|
||||
resolution: {integrity: sha512-mKZOzLRN0ETzau2W2QXefbFjo5EF4yWq28OyKb9ICdeNhHJlOE/pHHnz4hdYJ9cNZXcJHo5xN4OT4pzuSHSNvA==}
|
||||
engines: {node: '>=10'}
|
||||
@@ -2767,9 +2807,15 @@ packages:
|
||||
html-escaper@2.0.2:
|
||||
resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==}
|
||||
|
||||
html-escaper@3.0.3:
|
||||
resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==}
|
||||
|
||||
html-parse-string@0.0.9:
|
||||
resolution: {integrity: sha512-wyGnsOolHbNrcb8N6bdJF4EHyzd3zVGCb9/mBxeNjAYBDOZqD7YkqLBz7kXtdgHwNnV8lN/BpSDpsI1zm8Sd8g==}
|
||||
|
||||
htmlparser2@10.0.0:
|
||||
resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==}
|
||||
|
||||
http-errors@2.0.1:
|
||||
resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==}
|
||||
engines: {node: '>= 0.8'}
|
||||
@@ -3037,6 +3083,15 @@ packages:
|
||||
limiter@1.1.5:
|
||||
resolution: {integrity: sha512-FWWMIEOxz3GwUI4Ts/IvgVy6LPvoMPgjMdQ185nN6psJyBJ4yOpzqm695/h5umdLJg2vW3GR5iG11MAkR2AzJA==}
|
||||
|
||||
linkedom@0.18.12:
|
||||
resolution: {integrity: sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==}
|
||||
engines: {node: '>=16'}
|
||||
peerDependencies:
|
||||
canvas: '>= 2'
|
||||
peerDependenciesMeta:
|
||||
canvas:
|
||||
optional: true
|
||||
|
||||
linkify-it@5.0.0:
|
||||
resolution: {integrity: sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==}
|
||||
|
||||
@@ -3307,6 +3362,9 @@ packages:
|
||||
engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0}
|
||||
deprecated: This package is no longer supported.
|
||||
|
||||
nth-check@2.1.1:
|
||||
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
|
||||
|
||||
object-assign@4.1.1:
|
||||
resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
@@ -3982,6 +4040,9 @@ packages:
|
||||
uhtml@5.0.9:
|
||||
resolution: {integrity: sha512-qPyu3vGilaLe6zrjOCD/xezWEHLwdevxmbY3hzyhT25KBDF4F7YYW3YZcL3kylD/6dMoVISHjn8ggV3+9FY+5g==}
|
||||
|
||||
uhyphen@0.2.0:
|
||||
resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==}
|
||||
|
||||
uint8array-extras@1.5.0:
|
||||
resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==}
|
||||
engines: {node: '>=18'}
|
||||
@@ -5253,6 +5314,8 @@ snapshots:
|
||||
zod: 3.25.76
|
||||
zod-to-json-schema: 3.25.1(zod@3.25.76)
|
||||
|
||||
'@mozilla/readability@0.6.0': {}
|
||||
|
||||
'@napi-rs/wasm-runtime@1.1.1':
|
||||
dependencies:
|
||||
'@emnapi/core': 1.8.1
|
||||
@@ -6544,6 +6607,8 @@ snapshots:
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
boolbase@1.0.0: {}
|
||||
|
||||
bottleneck@2.19.5: {}
|
||||
|
||||
bowser@2.13.1: {}
|
||||
@@ -6745,6 +6810,18 @@ snapshots:
|
||||
shebang-command: 2.0.0
|
||||
which: 2.0.2
|
||||
|
||||
css-select@5.2.2:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
css-what: 6.2.2
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
nth-check: 2.1.1
|
||||
|
||||
css-what@6.2.2: {}
|
||||
|
||||
cssom@0.5.0: {}
|
||||
|
||||
curve25519-js@0.0.4: {}
|
||||
|
||||
data-uri-to-buffer@4.0.1: {}
|
||||
@@ -6777,10 +6854,28 @@ snapshots:
|
||||
dependencies:
|
||||
jszip: 3.10.1
|
||||
|
||||
dom-serializer@2.0.0:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
entities: 4.5.0
|
||||
|
||||
domelementtype@2.3.0: {}
|
||||
|
||||
domhandler@5.0.3:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
|
||||
dompurify@3.3.1:
|
||||
optionalDependencies:
|
||||
'@types/trusted-types': 2.0.7
|
||||
|
||||
domutils@3.2.2:
|
||||
dependencies:
|
||||
dom-serializer: 2.0.0
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
|
||||
dotenv@17.2.3: {}
|
||||
|
||||
dunder-proto@1.0.1:
|
||||
@@ -6808,6 +6903,8 @@ snapshots:
|
||||
|
||||
entities@4.5.0: {}
|
||||
|
||||
entities@6.0.1: {}
|
||||
|
||||
env-var@7.5.0:
|
||||
optional: true
|
||||
|
||||
@@ -7157,8 +7254,17 @@ snapshots:
|
||||
|
||||
html-escaper@2.0.2: {}
|
||||
|
||||
html-escaper@3.0.3: {}
|
||||
|
||||
html-parse-string@0.0.9: {}
|
||||
|
||||
htmlparser2@10.0.0:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
entities: 6.0.1
|
||||
|
||||
http-errors@2.0.1:
|
||||
dependencies:
|
||||
depd: 2.0.0
|
||||
@@ -7436,6 +7542,14 @@ snapshots:
|
||||
|
||||
limiter@1.1.5: {}
|
||||
|
||||
linkedom@0.18.12:
|
||||
dependencies:
|
||||
css-select: 5.2.2
|
||||
cssom: 0.5.0
|
||||
html-escaper: 3.0.3
|
||||
htmlparser2: 10.0.0
|
||||
uhyphen: 0.2.0
|
||||
|
||||
linkify-it@5.0.0:
|
||||
dependencies:
|
||||
uc.micro: 2.1.0
|
||||
@@ -7741,6 +7855,10 @@ snapshots:
|
||||
set-blocking: 2.0.0
|
||||
optional: true
|
||||
|
||||
nth-check@2.1.1:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
|
||||
object-assign@4.1.1: {}
|
||||
|
||||
object-inspect@1.13.4: {}
|
||||
@@ -8537,6 +8655,8 @@ snapshots:
|
||||
dependencies:
|
||||
'@webreflection/alien-signals': 0.3.2
|
||||
|
||||
uhyphen@0.2.0: {}
|
||||
|
||||
uint8array-extras@1.5.0: {}
|
||||
|
||||
undici-types@7.16.0: {}
|
||||
|
||||
49
src/agents/tools/web-tools.readability.test.ts
Normal file
49
src/agents/tools/web-tools.readability.test.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { extractReadableContent } from "./web-tools.js";
|
||||
|
||||
// Fixture: a small HTML page with a <nav> link list and a <footer> around a
// <main><article> that holds an <h1> and two paragraphs. The <title> and the
// <h1> both read "Example Article", which is the title the tests below expect.
const SAMPLE_HTML = `<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>Example Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<nav>
|
||||
<ul>
|
||||
<li><a href="/home">Home</a></li>
|
||||
<li><a href="/about">About</a></li>
|
||||
</ul>
|
||||
</nav>
|
||||
<main>
|
||||
<article>
|
||||
<h1>Example Article</h1>
|
||||
<p>Main content starts here with enough words to satisfy readability.</p>
|
||||
<p>Second paragraph for a bit more signal.</p>
|
||||
</article>
|
||||
</main>
|
||||
<footer>Footer text</footer>
|
||||
</body>
|
||||
</html>`;
|
||||
|
||||
describe("web fetch readability", () => {
  const articleUrl = "https://example.com/article";

  // Runs the extractor over the fixture page in the given mode and checks that
  // both the article body text and the page title come back.
  async function expectArticleExtracted(extractMode: "text" | "markdown"): Promise<void> {
    const extracted = await extractReadableContent({
      html: SAMPLE_HTML,
      url: articleUrl,
      extractMode,
    });
    expect(extracted?.text).toContain("Main content starts here");
    expect(extracted?.title).toBe("Example Article");
  }

  it("extracts readable text", async () => {
    await expectArticleExtracted("text");
  });

  it("extracts readable markdown", async () => {
    await expectArticleExtracted("markdown");
  });
});
|
||||
@@ -118,6 +118,11 @@ function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boole
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Decides whether Readability-based extraction is turned on for web_fetch.
 *
 * @param fetch - Optional web_fetch configuration.
 * @returns The configured `readability` flag when it is an actual boolean;
 *   `true` in every other case (config missing, flag unset, or flag not a boolean).
 */
function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  const configured = fetch?.readability;
  return typeof configured === "boolean" ? configured : true;
}
|
||||
|
||||
function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
|
||||
const fromConfig =
|
||||
search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
|
||||
@@ -300,6 +305,37 @@ async function readResponseText(res: Response): Promise<string> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extracts the main content of an HTML page using Readability over a linkedom DOM.
 *
 * Both libraries are loaded with dynamic `import()` at call time. Any failure
 * along the way (import, parsing, extraction, rendering) is swallowed and
 * reported as `null` so the caller can choose another extraction path.
 *
 * @param params.html - Raw HTML of the page.
 * @param params.url - URL of the page; assigned to the document's `baseURI` on a best-effort basis.
 * @param params.extractMode - `"text"` returns whitespace-normalized text content;
 *   any other mode renders the extracted HTML through `htmlToMarkdown`.
 * @returns The extracted text and, when available, a title; `null` when
 *   Readability produced no content or an error occurred.
 */
export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    // Load both libraries concurrently.
    const [readabilityModule, linkedomModule] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const { html, url, extractMode } = params;
    const { document } = linkedomModule.parseHTML(html);

    try {
      (document as { baseURI?: string }).baseURI = url;
    } catch {
      // Best-effort base URI for relative links; a failed assignment is not fatal.
    }

    // NOTE(review): charThreshold 0 presumably lets very short pages through
    // (Readability's minimum article length option) — confirm against its docs.
    const article = new readabilityModule.Readability(document, { charThreshold: 0 }).parse();
    if (article == null || !article.content) {
      return null;
    }

    // An empty title string is treated the same as a missing title.
    const articleTitle = article.title ? article.title : undefined;

    if (extractMode !== "text") {
      const markdown = htmlToMarkdown(article.content);
      // Prefer Readability's title; otherwise use the one found while rendering.
      return { text: markdown.text, title: articleTitle ?? markdown.title };
    }

    return {
      text: normalizeWhitespace(article.textContent ?? ""),
      title: articleTitle,
    };
  } catch {
    return null;
  }
}
|
||||
|
||||
async function runWebSearch(params: {
|
||||
query: string;
|
||||
count: number;
|
||||
@@ -377,6 +413,7 @@ async function runWebFetch(params: {
|
||||
timeoutSeconds: number;
|
||||
cacheTtlMs: number;
|
||||
userAgent: string;
|
||||
readabilityEnabled: boolean;
|
||||
}): Promise<Record<string, unknown>> {
|
||||
const cacheKey = normalizeCacheKey(
|
||||
`fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
|
||||
@@ -415,9 +452,25 @@ async function runWebFetch(params: {
|
||||
let title: string | undefined;
|
||||
let text = body;
|
||||
if (contentType.includes("text/html")) {
|
||||
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
|
||||
text = parsed.text;
|
||||
title = parsed.title;
|
||||
if (params.readabilityEnabled) {
|
||||
const readable = await extractReadableContent({
|
||||
html: body,
|
||||
url: res.url || params.url,
|
||||
extractMode: params.extractMode,
|
||||
});
|
||||
if (readable?.text) {
|
||||
text = readable.text;
|
||||
title = readable.title;
|
||||
} else {
|
||||
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
|
||||
text = parsed.text;
|
||||
title = parsed.title;
|
||||
}
|
||||
} else {
|
||||
const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
|
||||
text = parsed.text;
|
||||
title = parsed.title;
|
||||
}
|
||||
} else if (contentType.includes("application/json")) {
|
||||
try {
|
||||
text = JSON.stringify(JSON.parse(body), null, 2);
|
||||
@@ -490,6 +543,7 @@ export function createWebFetchTool(options?: {
|
||||
}): AnyAgentTool | null {
|
||||
const fetch = resolveFetchConfig(options?.config);
|
||||
if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
|
||||
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
|
||||
const userAgent =
|
||||
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
|
||||
`clawdbot/${VERSION}`;
|
||||
@@ -511,6 +565,7 @@ export function createWebFetchTool(options?: {
|
||||
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
|
||||
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
|
||||
userAgent,
|
||||
readabilityEnabled,
|
||||
});
|
||||
return jsonResult(result);
|
||||
},
|
||||
|
||||
@@ -262,6 +262,8 @@ const FIELD_HELP: Record<string, string> = {
|
||||
"tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
|
||||
"tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
|
||||
"tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.",
|
||||
"tools.web.fetch.readability":
|
||||
"Use Readability to extract main content from HTML (fallbacks to basic HTML cleanup).",
|
||||
"channels.slack.allowBots":
|
||||
"Allow bot-authored messages to trigger Slack replies (default: false).",
|
||||
"channels.slack.thread.historyScope":
|
||||
|
||||
@@ -99,7 +99,7 @@ export type ToolsConfig = {
|
||||
cacheTtlMinutes?: number;
|
||||
};
|
||||
fetch?: {
|
||||
/** Enable web fetch tool (default: false). */
|
||||
/** Enable web fetch tool (default: true). */
|
||||
enabled?: boolean;
|
||||
/** Max characters to return from fetched content. */
|
||||
maxChars?: number;
|
||||
@@ -109,6 +109,8 @@ export type ToolsConfig = {
|
||||
cacheTtlMinutes?: number;
|
||||
/** Override User-Agent header for fetch requests. */
|
||||
userAgent?: string;
|
||||
/** Use Readability to extract main content (default: true). */
|
||||
readability?: boolean;
|
||||
};
|
||||
};
|
||||
audio?: {
|
||||
|
||||
Reference in New Issue
Block a user