diff --git a/CHANGELOG.md b/CHANGELOG.md index cd84926e7..e55382ecc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - Docs: expand gateway security hardening guidance and incident response checklist. - Docs: document DM history limits for channel DMs. (#883) — thanks @pkrmf. - Security: add detect-secrets CI scan and baseline guidance. (#227) — thanks @Hyaxia. +- Tools: add `web_search`/`web_fetch` (Brave API), auto-enable `web_fetch` for sandboxed sessions, and remove the `brave-search` skill. ### Fixes - Browser: add tests for snapshot labels/efficient query params and labeled image responses. diff --git a/docs/docs.json b/docs/docs.json index 2d2e5387d..931620a66 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -889,6 +889,7 @@ "tools", "plugin", "tools/exec", + "tools/web", "tools/apply-patch", "tools/elevated", "tools/browser", diff --git a/docs/gateway/configuration-examples.md b/docs/gateway/configuration-examples.md index 2b3999779..9e4bd8701 100644 --- a/docs/gateway/configuration-examples.md +++ b/docs/gateway/configuration-examples.md @@ -384,7 +384,7 @@ Save to `~/.clawdbot/clawdbot.json` and you can DM the bot from that number. }, skills: { - allowBundled: ["brave-search", "gemini"], + allowBundled: ["gemini", "peekaboo"], load: { extraDirs: ["~/Projects/agent-scripts/skills"] }, diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index cf4e82647..72778e02c 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1605,6 +1605,18 @@ of `every`, keep `HEARTBEAT.md` tiny, and/or choose a cheaper `model`. Note: `applyPatch` is only under `tools.exec` (no `tools.bash` alias). Legacy: `tools.bash` is still accepted as an alias. 
+`tools.web` configures web search + fetch tools: +- `tools.web.search.enabled` (default: true when key is present) +- `tools.web.search.apiKey` (or `BRAVE_API_KEY` env var) +- `tools.web.search.maxResults` (1–10, default 5) +- `tools.web.search.timeoutSeconds` (default 30) +- `tools.web.search.cacheTtlMinutes` (default 15) +- `tools.web.fetch.enabled` (default false; sandboxed sessions auto-enable unless set to false) +- `tools.web.fetch.maxChars` (default 50000) +- `tools.web.fetch.timeoutSeconds` (default 30) +- `tools.web.fetch.cacheTtlMinutes` (default 15) +- `tools.web.fetch.userAgent` (optional override) + `agents.defaults.subagents` configures sub-agent defaults: - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call. - `maxConcurrent`: max concurrent sub-agent runs (default 1) @@ -1685,6 +1697,7 @@ Tool groups (shorthands) work in **global** and **per-agent** tool policies: - `group:fs`: `read`, `write`, `edit`, `apply_patch` - `group:sessions`: `sessions_list`, `sessions_history`, `sessions_send`, `sessions_spawn`, `session_status` - `group:memory`: `memory_search`, `memory_get` +- `group:web`: `web_search`, `web_fetch` - `group:ui`: `browser`, `canvas` - `group:automation`: `cron`, `gateway` - `group:messaging`: `message` @@ -2210,7 +2223,7 @@ Example: ```json5 { skills: { - allowBundled: ["brave-search", "gemini"], + allowBundled: ["gemini", "peekaboo"], load: { extraDirs: [ "~/Projects/agent-scripts/skills", diff --git a/docs/tools/index.md b/docs/tools/index.md index 2acb9bfc9..3362a6e56 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -131,6 +131,7 @@ Available groups: - `group:fs`: `read`, `write`, `edit`, `apply_patch` - `group:sessions`: `sessions_list`, `sessions_history`, `sessions_send`, `sessions_spawn`, `session_status` - `group:memory`: `memory_search`, `memory_get` +- `group:web`: `web_search`, `web_fetch` - 
`group:ui`: `browser`, `canvas` - `group:automation`: `cron`, `gateway` - `group:messaging`: `message` @@ -188,6 +189,33 @@ Notes: - `log` supports line-based `offset`/`limit` (omit `offset` to grab the last N lines). - `process` is scoped per agent; sessions from other agents are not visible. +### `web_search` +Search the web using Brave Search API. + +Core parameters: +- `query` (required) +- `count` (1–10; default from `tools.web.search.maxResults`) + +Notes: +- Requires `BRAVE_API_KEY` or `tools.web.search.apiKey`. +- Enable via `tools.web.search.enabled`. +- Responses are cached (default 15 min). +- See [Web tools](/tools/web) for setup. + +### `web_fetch` +Fetch and extract readable content from a URL (HTML → markdown/text). + +Core parameters: +- `url` (required) +- `extractMode` (`markdown` | `text`) +- `maxChars` (truncate long pages) + +Notes: +- Enable via `tools.web.fetch.enabled`. +- Responses are cached (default 15 min). +- For JS-heavy sites, prefer the browser tool. +- See [Web tools](/tools/web) for setup. + ### `browser` Control the dedicated clawd browser. diff --git a/docs/tools/skills-config.md b/docs/tools/skills-config.md index 392eca28c..89d09a35b 100644 --- a/docs/tools/skills-config.md +++ b/docs/tools/skills-config.md @@ -11,7 +11,7 @@ All skills-related configuration lives under `skills` in `~/.clawdbot/clawdbot.j ```json5 { skills: { - allowBundled: ["brave-search", "gemini"], + allowBundled: ["gemini", "peekaboo"], load: { extraDirs: [ "~/Projects/agent-scripts/skills", diff --git a/docs/tools/web.md b/docs/tools/web.md new file mode 100644 index 000000000..4d17561d1 --- /dev/null +++ b/docs/tools/web.md @@ -0,0 +1,103 @@ +--- +summary: "Web search + fetch tools (Brave Search API)" +read_when: + - You want to enable web_search or web_fetch + - You need Brave Search API key setup +--- + +# Web tools + +Clawdbot ships two lightweight web tools: + +- `web_search` — Brave Search API queries (fast, structured results). 
+- `web_fetch` — HTTP fetch + readable extraction (HTML → markdown/text). + +These are **not** browser automation. For JS-heavy sites or logins, use the +[Browser tool](/tools/browser). + +## How it works + +- `web_search` calls Brave’s Search API and returns structured results + (title, URL, snippet). No browser is involved. +- Results are cached per query and result count for 15 minutes (configurable). +- `web_fetch` does a plain HTTP GET and extracts readable content + (HTML → markdown/text). It does **not** execute JavaScript. +- In sandboxed sessions, `web_fetch` is enabled automatically (unless explicitly disabled). + +## web_search + +Search the web with Brave’s API. + +### Requirements + +- `tools.web.search.enabled` not set to `false` (it defaults to on when an API key is present) +- Brave API key via `BRAVE_API_KEY` **or** `tools.web.search.apiKey` + +### Config + +```json5 +{ + tools: { + web: { + search: { + enabled: true, + apiKey: "BRAVE_API_KEY_HERE", // optional if BRAVE_API_KEY is set + maxResults: 5, + timeoutSeconds: 30, + cacheTtlMinutes: 15 + } + } + } +} +``` + +### Tool parameters + +- `query` (required) +- `count` (1–10; default from config) + +## web_fetch + +Fetch a URL and extract readable content. + +### Requirements + +- `tools.web.fetch.enabled: true` (sandboxed sessions enable it automatically unless it is set to `false`) + +### Config + +```json5 +{ + tools: { + web: { + fetch: { + enabled: true, + maxChars: 50000, + timeoutSeconds: 30, + cacheTtlMinutes: 15, + userAgent: "clawdbot/2026.1.14" + } + } + } +} +``` + +### Tool parameters + +- `url` (required, http/https only) +- `extractMode` (`markdown` | `text`) +- `maxChars` (truncate long pages) + +Notes: +- `web_fetch` is best-effort extraction; some sites will need the browser tool. +- Responses are cached (default 15 minutes) to reduce repeated fetches. +- If you use tool profiles/allowlists, add `web_search`/`web_fetch` or `group:web`. + +## Getting a Brave API key + +1) Create a Brave Search API account at https://brave.com/search/api/ +2) Generate an API key in the dashboard. 
+3) Set `BRAVE_API_KEY` in your environment or paste it into `tools.web.search.apiKey`. + +Brave provides a free tier plus paid plans; check the Brave API portal for the +current limits and pricing. diff --git a/skills/brave-search/SKILL.md b/skills/brave-search/SKILL.md deleted file mode 100644 index 1e79ad339..000000000 --- a/skills/brave-search/SKILL.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: brave-search -description: Web search and content extraction via Brave Search API. -homepage: https://brave.com/search/api -metadata: {"clawdbot":{"emoji":"🦁","requires":{"bins":["node"],"env":["BRAVE_API_KEY"]},"primaryEnv":"BRAVE_API_KEY"}} ---- - -# Brave Search - -Headless web search (and lightweight content extraction) using Brave Search API. No browser required. - -## Search - -```bash -node {baseDir}/scripts/search.mjs "query" -node {baseDir}/scripts/search.mjs "query" -n 10 -node {baseDir}/scripts/search.mjs "query" --content -node {baseDir}/scripts/search.mjs "query" -n 3 --content -``` - -## Extract a page - -```bash -node {baseDir}/scripts/content.mjs "https://example.com/article" -``` - -Notes: -- Needs `BRAVE_API_KEY`. -- Content extraction is best-effort (good for articles; not for app-like sites). -- If a site is blocked or too JS-heavy, prefer the `summarize` skill (it can use a Firecrawl fallback). 
diff --git a/skills/brave-search/scripts/content.mjs b/skills/brave-search/scripts/content.mjs deleted file mode 100644 index cfdff7836..000000000 --- a/skills/brave-search/scripts/content.mjs +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env node - -function usage() { - console.error(`Usage: content.mjs `); - process.exit(2); -} - -export async function fetchAsMarkdown(url) { - const resp = await fetch(url, { - headers: { "User-Agent": "clawdbot-brave-search/1.0" }, - }); - const html = await resp.text(); - - // Very lightweight “readability-ish” extraction without dependencies: - // - drop script/style/nav/footer - // - strip tags - // - keep paragraphs - const cleaned = html - .replace(//gi, " ") - .replace(//gi, " ") - .replace(/<(nav|footer|header)[\s\S]*?<\/\1>/gi, " ") - .replace(//gi, "\n") - .replace(/<\/p>/gi, "\n\n") - .replace(/<\/div>/gi, "\n") - .replace(/<[^>]+>/g, " ") - .replace(/ /g, " ") - .replace(/&/g, "&") - .replace(/</g, "<") - .replace(/>/g, ">") - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/\s+\n/g, "\n") - .replace(/\n{3,}/g, "\n\n") - .replace(/[ \t]{2,}/g, " ") - .trim(); - - if (!resp.ok) { - return `> Fetch failed (${resp.status}).\n\n${cleaned.slice(0, 2000)}\n`; - } - - const paras = cleaned - .split("\n\n") - .map((p) => p.trim()) - .filter(Boolean) - .slice(0, 30); - - return paras.map((p) => `- ${p}`).join("\n") + "\n"; -} - -const args = process.argv.slice(2); -if (args.length === 0 || args[0] === "-h" || args[0] === "--help") usage(); -const url = args[0]; -process.stdout.write(await fetchAsMarkdown(url)); diff --git a/skills/brave-search/scripts/search.mjs b/skills/brave-search/scripts/search.mjs deleted file mode 100644 index 901ba6c17..000000000 --- a/skills/brave-search/scripts/search.mjs +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env node - -function usage() { - console.error(`Usage: search.mjs "query" [-n 5] [--content]`); - process.exit(2); -} - -const args = process.argv.slice(2); -if (args.length === 0 || 
args[0] === "-h" || args[0] === "--help") usage(); - -const query = args[0]; -let n = 5; -let withContent = false; - -for (let i = 1; i < args.length; i++) { - const a = args[i]; - if (a === "-n") { - n = Number.parseInt(args[i + 1] ?? "5", 10); - i++; - continue; - } - if (a === "--content") { - withContent = true; - continue; - } - console.error(`Unknown arg: ${a}`); - usage(); -} - -const apiKey = (process.env.BRAVE_API_KEY ?? "").trim(); -if (!apiKey) { - console.error("Missing BRAVE_API_KEY"); - process.exit(1); -} - -const endpoint = new URL("https://api.search.brave.com/res/v1/web/search"); -endpoint.searchParams.set("q", query); -endpoint.searchParams.set("count", String(Math.max(1, Math.min(n, 20)))); -endpoint.searchParams.set("text_decorations", "false"); -endpoint.searchParams.set("safesearch", "moderate"); - -const resp = await fetch(endpoint, { - headers: { - Accept: "application/json", - "X-Subscription-Token": apiKey, - }, -}); - -if (!resp.ok) { - const text = await resp.text().catch(() => ""); - throw new Error(`Brave Search failed (${resp.status}): ${text}`); -} - -const data = await resp.json(); -const results = (data?.web?.results ?? []).slice(0, n); - -const lines = []; -for (const r of results) { - const title = String(r?.title ?? "").trim(); - const url = String(r?.url ?? "").trim(); - const desc = String(r?.description ?? "").trim(); - if (!title || !url) continue; - lines.push(`- ${title}\n ${url}${desc ? `\n ${desc}` : ""}`); -} - -process.stdout.write(lines.join("\n\n") + "\n"); - -if (!withContent) process.exit(0); - -process.stdout.write("\n---\n\n"); -for (const r of results) { - const title = String(r?.title ?? "").trim(); - const url = String(r?.url ?? 
"").trim(); - if (!url) continue; - process.stdout.write(`# ${title || url}\n${url}\n\n`); - const child = await import("./content.mjs"); - const text = await child.fetchAsMarkdown(url); - process.stdout.write(text.trimEnd() + "\n\n"); -} diff --git a/src/agents/clawdbot-tools.ts b/src/agents/clawdbot-tools.ts index bf97483ba..4ff11a9d1 100644 --- a/src/agents/clawdbot-tools.ts +++ b/src/agents/clawdbot-tools.ts @@ -17,6 +17,7 @@ import { createSessionsHistoryTool } from "./tools/sessions-history-tool.js"; import { createSessionsListTool } from "./tools/sessions-list-tool.js"; import { createSessionsSendTool } from "./tools/sessions-send-tool.js"; import { createSessionsSpawnTool } from "./tools/sessions-spawn-tool.js"; +import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js"; export function createClawdbotTools(options?: { browserControlUrl?: string; @@ -56,6 +57,14 @@ export function createClawdbotTools(options?: { config: options?.config, agentSessionKey: options?.agentSessionKey, }); + const webSearchTool = createWebSearchTool({ + config: options?.config, + sandboxed: options?.sandboxed, + }); + const webFetchTool = createWebFetchTool({ + config: options?.config, + sandboxed: options?.sandboxed, + }); const tools: AnyAgentTool[] = [ createBrowserTool({ defaultControlUrl: options?.browserControlUrl, @@ -103,6 +112,8 @@ export function createClawdbotTools(options?: { config: options?.config, }), ...(memorySearchTool && memoryGetTool ? [memorySearchTool, memoryGetTool] : []), + ...(webSearchTool ? [webSearchTool] : []), + ...(webFetchTool ? [webFetchTool] : []), ...(imageTool ? 
[imageTool] : []), ]; diff --git a/src/agents/system-prompt.ts b/src/agents/system-prompt.ts index ea6ff99d2..d0d292403 100644 --- a/src/agents/system-prompt.ts +++ b/src/agents/system-prompt.ts @@ -54,6 +54,8 @@ export function buildAgentSystemPrompt(params: { ls: "List directory contents", exec: "Run shell commands", process: "Manage background exec sessions", + web_search: "Search the web (Brave API)", + web_fetch: "Fetch and extract readable content from a URL", // Channel docking: add login tools here when a channel needs interactive linking. browser: "Control web browser", canvas: "Present/eval/snapshot the Canvas", @@ -81,6 +83,8 @@ export function buildAgentSystemPrompt(params: { "ls", "exec", "process", + "web_search", + "web_fetch", "browser", "canvas", "nodes", diff --git a/src/agents/tool-display.json b/src/agents/tool-display.json index e593a97d4..da714a6dd 100644 --- a/src/agents/tool-display.json +++ b/src/agents/tool-display.json @@ -277,6 +277,16 @@ "title": "Memory Get", "detailKeys": ["path", "from", "lines"] }, + "web_search": { + "emoji": "🔎", + "title": "Web Search", + "detailKeys": ["query", "count"] + }, + "web_fetch": { + "emoji": "📄", + "title": "Web Fetch", + "detailKeys": ["url", "extractMode", "maxChars"] + }, "whatsapp_login": { "emoji": "🟢", "title": "WhatsApp Login", diff --git a/src/agents/tool-policy.ts b/src/agents/tool-policy.ts index e771c7104..8d3d2c812 100644 --- a/src/agents/tool-policy.ts +++ b/src/agents/tool-policy.ts @@ -13,6 +13,7 @@ const TOOL_NAME_ALIASES: Record = { export const TOOL_GROUPS: Record = { // NOTE: Keep canonical (lowercase) tool names here. 
"group:memory": ["memory_search", "memory_get"], + "group:web": ["web_search", "web_fetch"], // Basic workspace/file tools "group:fs": ["read", "write", "edit", "apply_patch"], // Host/runtime execution tools @@ -49,6 +50,8 @@ export const TOOL_GROUPS: Record = { "session_status", "memory_search", "memory_get", + "web_search", + "web_fetch", "image", ], }; diff --git a/src/agents/tools/web-tools.ts b/src/agents/tools/web-tools.ts new file mode 100644 index 000000000..a10d76a2c --- /dev/null +++ b/src/agents/tools/web-tools.ts @@ -0,0 +1,480 @@ +import { Type } from "@sinclair/typebox"; + +import type { ClawdbotConfig } from "../../config/config.js"; +import { VERSION } from "../../version.js"; +import { stringEnum } from "../schema/typebox.js"; +import type { AnyAgentTool } from "./common.js"; +import { jsonResult, readNumberParam, readStringParam } from "./common.js"; + +const SEARCH_PROVIDERS = ["brave"] as const; +const EXTRACT_MODES = ["markdown", "text"] as const; + +const DEFAULT_SEARCH_COUNT = 5; +const MAX_SEARCH_COUNT = 10; +const DEFAULT_FETCH_MAX_CHARS = 50_000; +const DEFAULT_TIMEOUT_SECONDS = 30; +const DEFAULT_CACHE_TTL_MINUTES = 15; +const DEFAULT_CACHE_MAX_ENTRIES = 100; + +const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"; + +type WebSearchConfig = NonNullable["web"] extends infer Web + ? Web extends { search?: infer Search } + ? Search + : undefined + : undefined; + +type WebFetchConfig = NonNullable["web"] extends infer Web + ? Web extends { fetch?: infer Fetch } + ? Fetch + : undefined + : undefined; + +type CacheEntry = { + value: T; + expiresAt: number; + insertedAt: number; +}; + +const SEARCH_CACHE = new Map>>(); +const FETCH_CACHE = new Map>>(); + +const WebSearchSchema = Type.Object({ + query: Type.String({ description: "Search query string." 
}), + count: Type.Optional( + Type.Number({ + description: "Number of results to return (1-10).", + minimum: 1, + maximum: MAX_SEARCH_COUNT, + }), + ), +}); + +const WebFetchSchema = Type.Object({ + url: Type.String({ description: "HTTP or HTTPS URL to fetch." }), + extractMode: Type.Optional( + stringEnum(EXTRACT_MODES, { + description: 'Extraction mode ("markdown" or "text").', + default: "markdown", + }), + ), + maxChars: Type.Optional( + Type.Number({ + description: "Maximum characters to return (truncates when exceeded).", + minimum: 100, + }), + ), +}); + +type BraveSearchResult = { + title?: string; + url?: string; + description?: string; + age?: string; +}; + +type BraveSearchResponse = { + web?: { + results?: BraveSearchResult[]; + }; +}; + +function resolveSearchConfig(cfg?: ClawdbotConfig): WebSearchConfig { + const search = cfg?.tools?.web?.search; + if (!search || typeof search !== "object") return undefined; + return search as WebSearchConfig; +} + +function resolveFetchConfig(cfg?: ClawdbotConfig): WebFetchConfig { + const fetch = cfg?.tools?.web?.fetch; + if (!fetch || typeof fetch !== "object") return undefined; + return fetch as WebFetchConfig; +} + +function resolveSearchEnabled(params: { search?: WebSearchConfig; sandboxed?: boolean }): boolean { + if (typeof params.search?.enabled === "boolean") return params.search.enabled; + if (params.sandboxed) return true; + return true; +} + +function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boolean }): boolean { + if (typeof params.fetch?.enabled === "boolean") return params.fetch.enabled; + if (params.sandboxed) return true; + return false; +} + +function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { + const fromConfig = + search && "apiKey" in search && typeof search.apiKey === "string" + ? search.apiKey.trim() + : ""; + const fromEnv = (process.env.BRAVE_API_KEY ?? 
"").trim(); + return fromConfig || fromEnv || undefined; +} + +function resolveSearchProvider(search?: WebSearchConfig): (typeof SEARCH_PROVIDERS)[number] { + const raw = + search && "provider" in search && typeof search.provider === "string" + ? search.provider.trim().toLowerCase() + : ""; + if (raw === "brave") return "brave"; + return "brave"; +} + +function resolveTimeoutSeconds(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(1, Math.floor(parsed)); +} + +function resolveCacheTtlMs(value: unknown, fallbackMinutes: number): number { + const minutes = + typeof value === "number" && Number.isFinite(value) ? Math.max(0, value) : fallbackMinutes; + return Math.round(minutes * 60_000); +} + +function resolveMaxChars(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(100, Math.floor(parsed)); +} + +function resolveSearchCount(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? 
value : fallback; + const clamped = Math.max(1, Math.min(MAX_SEARCH_COUNT, Math.floor(parsed))); + return clamped; +} + +function normalizeCacheKey(value: string): string { + return value.trim().toLowerCase(); +} + +function readCache( + cache: Map>, + key: string, +): { value: T; cached: boolean } | null { + const entry = cache.get(key); + if (!entry) return null; + if (Date.now() > entry.expiresAt) { + cache.delete(key); + return null; + } + return { value: entry.value, cached: true }; +} + +function writeCache( + cache: Map>, + key: string, + value: T, + ttlMs: number, +) { + if (ttlMs <= 0) return; + if (cache.size >= DEFAULT_CACHE_MAX_ENTRIES) { + const oldest = cache.keys().next(); + if (!oldest.done) cache.delete(oldest.value); + } + cache.set(key, { + value, + expiresAt: Date.now() + ttlMs, + insertedAt: Date.now(), + }); +} + +function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): AbortSignal { + if (timeoutMs <= 0) return signal ?? new AbortController().signal; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + if (signal) { + signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + controller.abort(); + }, + { once: true }, + ); + } + controller.signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + }, + { once: true }, + ); + return controller.signal; +} + +function decodeEntities(value: string): string { + return value + .replace(/ /gi, " ") + .replace(/&/gi, "&") + .replace(/"/gi, '"') + .replace(/'/gi, "'") + .replace(/</gi, "<") + .replace(/>/gi, ">") + .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16))) + .replace(/&#(\d+);/gi, (_, dec) => String.fromCharCode(Number.parseInt(dec, 10))); +} + +function stripTags(value: string): string { + return decodeEntities(value.replace(/<[^>]+>/g, "")); +} + +function normalizeWhitespace(value: string): string { + return value + .replace(/\r/g, "") + .replace(/[ 
\t]+\n/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .replace(/[ \t]{2,}/g, " ") + .trim(); +} + +function htmlToMarkdown(html: string): { text: string; title?: string } { + const titleMatch = html.match(/]*>([\s\S]*?)<\/title>/i); + const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined; + let text = html + .replace(//gi, "") + .replace(//gi, "") + .replace(//gi, ""); + text = text.replace(/]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_, href, body) => { + const label = normalizeWhitespace(stripTags(body)); + if (!label) return href; + return `[${label}](${href})`; + }); + text = text.replace(/]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => { + const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10)))); + const label = normalizeWhitespace(stripTags(body)); + return `\n${prefix} ${label}\n`; + }); + text = text.replace(/]*>([\s\S]*?)<\/li>/gi, (_, body) => { + const label = normalizeWhitespace(stripTags(body)); + return label ? `\n- ${label}` : ""; + }); + text = text + .replace(/<(br|hr)\s*\/?>/gi, "\n") + .replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n"); + text = stripTags(text); + text = normalizeWhitespace(text); + return { text, title }; +} + +function htmlToText(html: string): { text: string; title?: string } { + const { text, title } = htmlToMarkdown(html); + return { text, title }; +} + +function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } { + if (value.length <= maxChars) return { text: value, truncated: false }; + return { text: value.slice(0, maxChars), truncated: true }; +} + +function resolveSiteName(url: string | undefined): string | undefined { + if (!url) return undefined; + try { + return new URL(url).hostname; + } catch { + return undefined; + } +} + +async function readResponseText(res: Response): Promise { + try { + return await res.text(); + } catch { + return ""; + } +} + +async function runWebSearch(params: { + query: string; + 
count: number; + apiKey: string; + timeoutSeconds: number; + cacheTtlMs: number; + provider: (typeof SEARCH_PROVIDERS)[number]; +}): Promise> { + const cacheKey = normalizeCacheKey(`${params.provider}:${params.query}:${params.count}`); + const cached = readCache(SEARCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true }; + + const start = Date.now(); + if (params.provider !== "brave") { + throw new Error("Unsupported web search provider."); + } + + const url = new URL(BRAVE_SEARCH_ENDPOINT); + url.searchParams.set("q", params.query); + url.searchParams.set("count", String(params.count)); + + const res = await fetch(url.toString(), { + method: "GET", + headers: { + Accept: "application/json", + "X-Subscription-Token": params.apiKey, + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`); + } + + const data = (await res.json()) as BraveSearchResponse; + const results = Array.isArray(data.web?.results) ? data.web?.results ?? [] : []; + const mapped = results.map((entry) => ({ + title: entry.title ?? "", + url: entry.url ?? "", + description: entry.description ?? "", + published: entry.age ?? undefined, + siteName: resolveSiteName(entry.url ?? 
""), + })); + + const payload = { + query: params.query, + provider: params.provider, + count: mapped.length, + tookMs: Date.now() - start, + results: mapped, + }; + writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; +} + +async function runWebFetch(params: { + url: string; + extractMode: (typeof EXTRACT_MODES)[number]; + maxChars: number; + timeoutSeconds: number; + cacheTtlMs: number; + userAgent: string; +}): Promise> { + const cacheKey = normalizeCacheKey( + `fetch:${params.url}:${params.extractMode}:${params.maxChars}`, + ); + const cached = readCache(FETCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true }; + + let parsedUrl: URL; + try { + parsedUrl = new URL(params.url); + } catch { + throw new Error("Invalid URL: must be http or https"); + } + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new Error("Invalid URL: must be http or https"); + } + + const start = Date.now(); + const res = await fetch(parsedUrl.toString(), { + method: "GET", + headers: { + Accept: "*/*", + "User-Agent": params.userAgent, + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`); + } + + const contentType = res.headers.get("content-type") ?? "application/octet-stream"; + const body = await readResponseText(res); + + let title: string | undefined; + let text = body; + if (contentType.includes("text/html")) { + const parsed = params.extractMode === "text" ? 
htmlToText(body) : htmlToMarkdown(body); + text = parsed.text; + title = parsed.title; + } else if (contentType.includes("application/json")) { + try { + text = JSON.stringify(JSON.parse(body), null, 2); + } catch { + text = body; + } + } + + const truncated = truncateText(text, params.maxChars); + const payload = { + url: params.url, + finalUrl: res.url || params.url, + status: res.status, + contentType, + title, + extractMode: params.extractMode, + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; +} + +export function createWebSearchTool(options?: { + config?: ClawdbotConfig; + sandboxed?: boolean; +}): AnyAgentTool | null { + const search = resolveSearchConfig(options?.config); + if (!resolveSearchEnabled({ search, sandboxed: options?.sandboxed })) return null; + const apiKey = resolveSearchApiKey(search); + if (!apiKey) return null; + return { + label: "Web Search", + name: "web_search", + description: + "Search the web using Brave Search API. Returns titles, URLs, and snippets for fast research.", + parameters: WebSearchSchema, + execute: async (_toolCallId, args) => { + const params = args as Record; + const query = readStringParam(params, "query", { required: true }); + const count = + readNumberParam(params, "count", { integer: true }) ?? search?.maxResults ?? 
undefined; + const result = await runWebSearch({ + query, + count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT), + apiKey, + timeoutSeconds: resolveTimeoutSeconds(search?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), + cacheTtlMs: resolveCacheTtlMs(search?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), + provider: resolveSearchProvider(search), + }); + return jsonResult(result); + }, + }; +} + +export function createWebFetchTool(options?: { + config?: ClawdbotConfig; + sandboxed?: boolean; +}): AnyAgentTool | null { + const fetch = resolveFetchConfig(options?.config); + if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null; + const userAgent = + (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || + `clawdbot/${VERSION}`; + return { + label: "Web Fetch", + name: "web_fetch", + description: + "Fetch and extract readable content from a URL (HTML → markdown/text). Use for lightweight page access without browser automation.", + parameters: WebFetchSchema, + execute: async (_toolCallId, args) => { + const params = args as Record; + const url = readStringParam(params, "url", { required: true }); + const extractMode = + readStringParam(params, "extractMode") === "text" ? "text" : "markdown"; + const maxChars = readNumberParam(params, "maxChars", { integer: true }); + const result = await runWebFetch({ + url, + extractMode, + maxChars: resolveMaxChars(maxChars ?? 
fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS), + timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), + cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), + userAgent, + }); + return jsonResult(result); + }, + }; +} diff --git a/src/config/schema.ts b/src/config/schema.ts index 0534ca96a..bccd67758 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -105,6 +105,17 @@ const FIELD_LABELS: Record = { "agents.list[].tools.byProvider": "Agent Tool Policy by Provider", "tools.exec.applyPatch.enabled": "Enable apply_patch", "tools.exec.applyPatch.allowModels": "apply_patch Model Allowlist", + "tools.web.search.enabled": "Enable Web Search Tool", + "tools.web.search.provider": "Web Search Provider", + "tools.web.search.apiKey": "Brave Search API Key", + "tools.web.search.maxResults": "Web Search Max Results", + "tools.web.search.timeoutSeconds": "Web Search Timeout (sec)", + "tools.web.search.cacheTtlMinutes": "Web Search Cache TTL (min)", + "tools.web.fetch.enabled": "Enable Web Fetch Tool", + "tools.web.fetch.maxChars": "Web Fetch Max Chars", + "tools.web.fetch.timeoutSeconds": "Web Fetch Timeout (sec)", + "tools.web.fetch.cacheTtlMinutes": "Web Fetch Cache TTL (min)", + "tools.web.fetch.userAgent": "Web Fetch User-Agent", "gateway.controlUi.basePath": "Control UI Base Path", "gateway.http.endpoints.chatCompletions.enabled": "OpenAI Chat Completions Endpoint", "gateway.reload.mode": "Config Reload Mode", @@ -219,6 +230,17 @@ const FIELD_HELP: Record = { "Experimental. Enables apply_patch for OpenAI models when allowed by tool policy.", "tools.exec.applyPatch.allowModels": 'Optional allowlist of model ids (e.g. 
"gpt-5.2" or "openai/gpt-5.2").', + "tools.web.search.enabled": "Enable the web_search tool (requires Brave API key).", + "tools.web.search.provider": 'Search provider (only "brave" supported today).', + "tools.web.search.apiKey": "Brave Search API key (fallback: BRAVE_API_KEY env var).", + "tools.web.search.maxResults": "Default number of results to return (1-10).", + "tools.web.search.timeoutSeconds": "Timeout in seconds for web_search requests.", + "tools.web.search.cacheTtlMinutes": "Cache TTL in minutes for web_search results.", + "tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).", + "tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).", + "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.", + "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.", + "tools.web.fetch.userAgent": "Override User-Agent header for web_fetch requests.", "channels.slack.allowBots": "Allow bot-authored messages to trigger Slack replies (default: false).", "auth.profiles": "Named auth profiles (provider + mode + optional email).", diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 9c9706979..8f7a36fa5 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -73,6 +73,34 @@ export type ToolsConfig = { profile?: ToolProfileId; allow?: string[]; deny?: string[]; + web?: { + search?: { + /** Enable web search tool (default: true when API key is present). */ + enabled?: boolean; + /** Search provider (currently "brave"). */ + provider?: "brave"; + /** Brave Search API key (optional; defaults to BRAVE_API_KEY env var). */ + apiKey?: string; + /** Default search results count (1-10). */ + maxResults?: number; + /** Timeout in seconds for search requests. */ + timeoutSeconds?: number; + /** Cache TTL in minutes for search results. */ + cacheTtlMinutes?: number; + }; + fetch?: { + /** Enable web fetch tool (default: false). 
// Validators reused by the `tools.web.*` schemas below; each field stays optional.
const webToolEnabledFlag = z.boolean().optional();
const webToolPositiveInt = z.number().int().positive().optional();
const webToolCacheTtlMinutes = z.number().nonnegative().optional();

/** Schema for the optional `tools.web.search` section. */
export const ToolsWebSearchSchema = z
  .object({
    enabled: webToolEnabledFlag,
    provider: z.union([z.literal("brave")]).optional(),
    apiKey: z.string().optional(),
    maxResults: webToolPositiveInt,
    timeoutSeconds: webToolPositiveInt,
    cacheTtlMinutes: webToolCacheTtlMinutes,
  })
  .optional();

/** Schema for the optional `tools.web.fetch` section. */
export const ToolsWebFetchSchema = z
  .object({
    enabled: webToolEnabledFlag,
    maxChars: webToolPositiveInt,
    timeoutSeconds: webToolPositiveInt,
    cacheTtlMinutes: webToolCacheTtlMinutes,
    userAgent: z.string().optional(),
  })
  .optional();

/** Schema for the optional `tools.web` section, combining the search and fetch sections. */
export const ToolsWebSchema = z
  .object({
    search: ToolsWebSearchSchema,
    fetch: ToolsWebFetchSchema,
  })
  .optional();