diff --git a/docs/brave-search.md b/docs/brave-search.md new file mode 100644 index 000000000..a29143fb4 --- /dev/null +++ b/docs/brave-search.md @@ -0,0 +1,40 @@ +--- +summary: "Brave Search API setup for web_search" +read_when: + - You want to use Brave Search for web_search + - You need a BRAVE_API_KEY or plan details +--- + +# Brave Search API + +Clawdbot uses Brave Search as the default provider for `web_search`. + +## Get an API key + +1) Create a Brave Search API account at https://brave.com/search/api/ +2) In the dashboard, choose the **Data for Search** plan and generate an API key. +3) Store the key in config (recommended) or set `BRAVE_API_KEY` in the Gateway environment. + +## Config example + +```json5 +{ + tools: { + web: { + search: { + provider: "brave", + apiKey: "BRAVE_API_KEY_HERE", + maxResults: 5, + timeoutSeconds: 30 + } + } + } +} +``` + +## Notes + +- The Data for AI plan is **not** compatible with `web_search`. +- Brave provides a free tier plus paid plans; check the Brave API portal for current limits. + +See [Web tools](/tools/web) for the full web_search configuration. diff --git a/docs/perplexity.md b/docs/perplexity.md new file mode 100644 index 000000000..829c2f25f --- /dev/null +++ b/docs/perplexity.md @@ -0,0 +1,76 @@ +--- +summary: "Perplexity Sonar setup for web_search" +read_when: + - You want to use Perplexity Sonar for web search + - You need PERPLEXITY_API_KEY or OpenRouter setup +--- + +# Perplexity Sonar + +Clawdbot can use Perplexity Sonar for the `web_search` tool. You can connect +through Perplexity’s direct API or via OpenRouter. + +## API options + +### Perplexity (direct) + +- Base URL: https://api.perplexity.ai +- Environment variable: `PERPLEXITY_API_KEY` + +### OpenRouter (alternative) + +- Base URL: https://openrouter.ai/api/v1 +- Environment variable: `OPENROUTER_API_KEY` +- Supports prepaid/crypto credits. 
+ +## Config example + +```json5 +{ + tools: { + web: { + search: { + provider: "perplexity", + perplexity: { + apiKey: "pplx-...", + baseUrl: "https://api.perplexity.ai", + model: "perplexity/sonar-pro" + } + } + } + } +} +``` + +## Switching from Brave + +```json5 +{ + tools: { + web: { + search: { + provider: "perplexity", + perplexity: { + apiKey: "pplx-...", + baseUrl: "https://api.perplexity.ai" + } + } + } + } +} +``` + +If both `PERPLEXITY_API_KEY` and `OPENROUTER_API_KEY` are set, `PERPLEXITY_API_KEY` takes precedence. +Set `tools.web.search.perplexity.baseUrl` together with `tools.web.search.perplexity.apiKey` +to choose a provider explicitly; a config `apiKey` without `baseUrl` is sent to OpenRouter. + +If `PERPLEXITY_API_KEY` is used from the environment and no base URL is set, +Clawdbot defaults to the direct Perplexity endpoint. Set `baseUrl` to override. + +## Models + +- `perplexity/sonar` — fast Q&A with web search +- `perplexity/sonar-pro` (default) — multi-step reasoning + web search +- `perplexity/sonar-reasoning-pro` — deep research + +See [Web tools](/tools/web) for the full web_search configuration. diff --git a/docs/tools/web.md b/docs/tools/web.md index f36f0f0b4..3780538a5 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -1,5 +1,5 @@ --- -summary: "Web search + fetch tools (Brave Search API, Perplexity via OpenRouter)" +summary: "Web search + fetch tools (Brave Search API, Perplexity direct/OpenRouter)" read_when: - You want to enable web_search or web_fetch - You need Brave Search API key setup @@ -33,6 +33,8 @@ These are **not** browser automation. For JS-heavy sites or logins, use the | **Brave** (default) | Fast, structured results, free tier | Traditional search results | `BRAVE_API_KEY` | | **Perplexity** | AI-synthesized answers, citations, real-time | Requires OpenRouter credits | `OPENROUTER_API_KEY` or `PERPLEXITY_API_KEY` | +See [Brave Search setup](/brave-search) and [Perplexity Sonar](/perplexity) for provider-specific details. 
+ Set the provider in config: ```json5 @@ -47,6 +49,25 @@ Set the provider in config: } ``` +Example: switch to Perplexity Sonar (direct API): + +```json5 +{ + tools: { + web: { + search: { + provider: "perplexity", + perplexity: { + apiKey: "pplx-...", + baseUrl: "https://api.perplexity.ai", + model: "perplexity/sonar-pro" + } + } + } + } +} +``` + ## Getting a Brave API key 1) Create a Brave Search API account at https://brave.com/search/api/ @@ -65,7 +86,7 @@ current limits and pricing. environment. For a daemon install, put it in `~/.clawdbot/.env` (or your service environment). See [Env vars](/start/faq#how-does-clawdbot-load-environment-variables). -## Using Perplexity (via OpenRouter) +## Using Perplexity (direct or via OpenRouter) Perplexity Sonar models have built-in web search capabilities and return AI-synthesized answers with citations. You can use them via OpenRouter (no credit card required - supports @@ -103,6 +124,9 @@ crypto/prepaid). **Environment alternative:** set `OPENROUTER_API_KEY` or `PERPLEXITY_API_KEY` in the Gateway environment. For a daemon install, put it in `~/.clawdbot/.env`. +If `PERPLEXITY_API_KEY` is used from the environment and no base URL is set, +Clawdbot defaults to the direct Perplexity endpoint (`https://api.perplexity.ai`). 
+ ### Available Perplexity models | Model | Description | Best for | diff --git a/src/agents/tools/web-fetch-utils.ts b/src/agents/tools/web-fetch-utils.ts new file mode 100644 index 000000000..1a780b9d2 --- /dev/null +++ b/src/agents/tools/web-fetch-utils.ts @@ -0,0 +1,105 @@ +export type ExtractMode = "markdown" | "text"; + +function decodeEntities(value: string): string { + return value + .replace(/ /gi, " ") + .replace(/&/gi, "&") + .replace(/"/gi, '"') + .replace(/'/gi, "'") + .replace(/</gi, "<") + .replace(/>/gi, ">") + .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16))) + .replace(/&#(\d+);/gi, (_, dec) => String.fromCharCode(Number.parseInt(dec, 10))); +} + +function stripTags(value: string): string { + return decodeEntities(value.replace(/<[^>]+>/g, "")); +} + +function normalizeWhitespace(value: string): string { + return value + .replace(/\r/g, "") + .replace(/[ \t]+\n/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .replace(/[ \t]{2,}/g, " ") + .trim(); +} + +function htmlToMarkdown(html: string): { text: string; title?: string } { + const titleMatch = html.match(/]*>([\s\S]*?)<\/title>/i); + const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined; + let text = html + .replace(//gi, "") + .replace(//gi, "") + .replace(//gi, ""); + text = text.replace(/]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_, href, body) => { + const label = normalizeWhitespace(stripTags(body)); + if (!label) return href; + return `[${label}](${href})`; + }); + text = text.replace(/]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => { + const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10)))); + const label = normalizeWhitespace(stripTags(body)); + return `\n${prefix} ${label}\n`; + }); + text = text.replace(/]*>([\s\S]*?)<\/li>/gi, (_, body) => { + const label = normalizeWhitespace(stripTags(body)); + return label ? 
`\n- ${label}` : ""; + }); + text = text + .replace(/<(br|hr)\s*\/?>/gi, "\n") + .replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n"); + text = stripTags(text); + text = normalizeWhitespace(text); + return { text, title }; +} + +export function markdownToText(markdown: string): string { + let text = markdown; + text = text.replace(/!\[[^\]]*]\([^)]+\)/g, ""); + text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1"); + text = text.replace(/```[\s\S]*?```/g, (block) => + block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""), + ); + text = text.replace(/`([^`]+)`/g, "$1"); + text = text.replace(/^#{1,6}\s+/gm, ""); + text = text.replace(/^\s*[-*+]\s+/gm, ""); + text = text.replace(/^\s*\d+\.\s+/gm, ""); + return normalizeWhitespace(text); +} + +export function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } { + if (value.length <= maxChars) return { text: value, truncated: false }; + return { text: value.slice(0, maxChars), truncated: true }; +} + +export async function extractReadableContent(params: { + html: string; + url: string; + extractMode: ExtractMode; +}): Promise<{ text: string; title?: string } | null> { + try { + const [{ Readability }, { parseHTML }] = await Promise.all([ + import("@mozilla/readability"), + import("linkedom"), + ]); + const { document } = parseHTML(params.html); + try { + (document as { baseURI?: string }).baseURI = params.url; + } catch { + // Best-effort base URI for relative links. + } + const reader = new Readability(document, { charThreshold: 0 }); + const parsed = reader.parse(); + if (!parsed?.content) return null; + const title = parsed.title || undefined; + if (params.extractMode === "text") { + const text = normalizeWhitespace(parsed.textContent ?? ""); + return { text, title }; + } + const rendered = htmlToMarkdown(parsed.content); + return { text: rendered.text, title: title ?? 
rendered.title }; + } catch { + return null; + } +} diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts new file mode 100644 index 000000000..721068385 --- /dev/null +++ b/src/agents/tools/web-fetch.ts @@ -0,0 +1,496 @@ +import { Type } from "@sinclair/typebox"; + +import type { ClawdbotConfig } from "../../config/config.js"; +import { stringEnum } from "../schema/typebox.js"; +import type { AnyAgentTool } from "./common.js"; +import { jsonResult, readNumberParam, readStringParam } from "./common.js"; +import { + CacheEntry, + DEFAULT_CACHE_TTL_MINUTES, + DEFAULT_TIMEOUT_SECONDS, + normalizeCacheKey, + readCache, + readResponseText, + resolveCacheTtlMs, + resolveTimeoutSeconds, + withTimeout, + writeCache, +} from "./web-shared.js"; +import { + extractReadableContent, + markdownToText, + truncateText, + type ExtractMode, +} from "./web-fetch-utils.js"; + +export { extractReadableContent } from "./web-fetch-utils.js"; + +const EXTRACT_MODES = ["markdown", "text"] as const; + +const DEFAULT_FETCH_MAX_CHARS = 50_000; +const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev"; +const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000; +const DEFAULT_FETCH_USER_AGENT = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"; + +const FETCH_CACHE = new Map>>(); + +const WebFetchSchema = Type.Object({ + url: Type.String({ description: "HTTP or HTTPS URL to fetch." }), + extractMode: Type.Optional( + stringEnum(EXTRACT_MODES, { + description: 'Extraction mode ("markdown" or "text").', + default: "markdown", + }), + ), + maxChars: Type.Optional( + Type.Number({ + description: "Maximum characters to return (truncates when exceeded).", + minimum: 100, + }), + ), +}); + +type WebFetchConfig = NonNullable["web"] extends infer Web + ? Web extends { fetch?: infer Fetch } + ? 
Fetch + : undefined + : undefined; + +type FirecrawlFetchConfig = + | { + enabled?: boolean; + apiKey?: string; + baseUrl?: string; + onlyMainContent?: boolean; + maxAgeMs?: number; + timeoutSeconds?: number; + } + | undefined; + +function resolveFetchConfig(cfg?: ClawdbotConfig): WebFetchConfig { + const fetch = cfg?.tools?.web?.fetch; + if (!fetch || typeof fetch !== "object") return undefined; + return fetch as WebFetchConfig; +} + +function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boolean }): boolean { + if (typeof params.fetch?.enabled === "boolean") return params.fetch.enabled; + return true; +} + +function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean { + if (typeof fetch?.readability === "boolean") return fetch.readability; + return true; +} + +function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig { + if (!fetch || typeof fetch !== "object") return undefined; + const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined; + if (!firecrawl || typeof firecrawl !== "object") return undefined; + return firecrawl as FirecrawlFetchConfig; +} + +function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined { + const fromConfig = + firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string" + ? firecrawl.apiKey.trim() + : ""; + const fromEnv = (process.env.FIRECRAWL_API_KEY ?? "").trim(); + return fromConfig || fromEnv || undefined; +} + +function resolveFirecrawlEnabled(params: { + firecrawl?: FirecrawlFetchConfig; + apiKey?: string; +}): boolean { + if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled; + return Boolean(params.apiKey); +} + +function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string { + const raw = + firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string" + ? 
firecrawl.baseUrl.trim() + : ""; + return raw || DEFAULT_FIRECRAWL_BASE_URL; +} + +function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean { + if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent; + return true; +} + +function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined { + const raw = + firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number" + ? firecrawl.maxAgeMs + : undefined; + if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined; + const parsed = Math.max(0, Math.floor(raw)); + return parsed > 0 ? parsed : undefined; +} + +function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number { + const resolved = resolveFirecrawlMaxAgeMs(firecrawl); + if (typeof resolved === "number") return resolved; + return DEFAULT_FIRECRAWL_MAX_AGE_MS; +} + +function resolveMaxChars(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? 
value : fallback; + return Math.max(100, Math.floor(parsed)); +} + +export async function fetchFirecrawlContent(params: { + url: string; + extractMode: ExtractMode; + apiKey: string; + baseUrl: string; + onlyMainContent: boolean; + maxAgeMs: number; + proxy: "auto" | "basic" | "stealth"; + storeInCache: boolean; + timeoutSeconds: number; +}): Promise<{ + text: string; + title?: string; + finalUrl?: string; + status?: number; + warning?: string; +}> { + const endpoint = resolveFirecrawlEndpoint(params.baseUrl); + const body: Record = { + url: params.url, + formats: ["markdown"], + onlyMainContent: params.onlyMainContent, + timeout: params.timeoutSeconds * 1000, + maxAge: params.maxAgeMs, + proxy: params.proxy, + storeInCache: params.storeInCache, + }; + + const res = await fetch(endpoint, { + method: "POST", + headers: { + Authorization: `Bearer ${params.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + const payload = (await res.json()) as { + success?: boolean; + data?: { + markdown?: string; + content?: string; + metadata?: { + title?: string; + sourceURL?: string; + statusCode?: number; + }; + }; + warning?: string; + error?: string; + }; + + if (!res.ok || payload?.success === false) { + const detail = payload?.error || res.statusText; + throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim()); + } + + const data = payload?.data ?? {}; + const rawText = + typeof data.markdown === "string" + ? data.markdown + : typeof data.content === "string" + ? data.content + : ""; + const text = params.extractMode === "text" ? 
markdownToText(rawText) : rawText; + return { + text, + title: data.metadata?.title, + finalUrl: data.metadata?.sourceURL, + status: data.metadata?.statusCode, + warning: payload?.warning, + }; +} + +async function runWebFetch(params: { + url: string; + extractMode: ExtractMode; + maxChars: number; + timeoutSeconds: number; + cacheTtlMs: number; + userAgent: string; + readabilityEnabled: boolean; + firecrawlEnabled: boolean; + firecrawlApiKey?: string; + firecrawlBaseUrl: string; + firecrawlOnlyMainContent: boolean; + firecrawlMaxAgeMs: number; + firecrawlProxy: "auto" | "basic" | "stealth"; + firecrawlStoreInCache: boolean; + firecrawlTimeoutSeconds: number; +}): Promise> { + const cacheKey = normalizeCacheKey( + `fetch:${params.url}:${params.extractMode}:${params.maxChars}`, + ); + const cached = readCache(FETCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true }; + + let parsedUrl: URL; + try { + parsedUrl = new URL(params.url); + } catch { + throw new Error("Invalid URL: must be http or https"); + } + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new Error("Invalid URL: must be http or https"); + } + + const start = Date.now(); + let res: Response; + try { + res = await fetch(parsedUrl.toString(), { + method: "GET", + headers: { + Accept: "*/*", + "User-Agent": params.userAgent, + "Accept-Language": "en-US,en;q=0.9", + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + } catch (error) { + if (params.firecrawlEnabled && params.firecrawlApiKey) { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + const truncated = truncateText(firecrawl.text, 
params.maxChars); + const payload = { + url: params.url, + finalUrl: firecrawl.finalUrl || params.url, + status: firecrawl.status ?? 200, + contentType: "text/markdown", + title: firecrawl.title, + extractMode: params.extractMode, + extractor: "firecrawl", + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + warning: firecrawl.warning, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } + throw error; + } + + if (!res.ok) { + if (params.firecrawlEnabled && params.firecrawlApiKey) { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + const truncated = truncateText(firecrawl.text, params.maxChars); + const payload = { + url: params.url, + finalUrl: firecrawl.finalUrl || params.url, + status: firecrawl.status ?? res.status, + contentType: "text/markdown", + title: firecrawl.title, + extractMode: params.extractMode, + extractor: "firecrawl", + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + warning: firecrawl.warning, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } + const detail = await readResponseText(res); + throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`); + } + + const contentType = res.headers.get("content-type") ?? 
"application/octet-stream"; + const body = await readResponseText(res); + + let title: string | undefined; + let extractor = "raw"; + let text = body; + if (contentType.includes("text/html")) { + if (params.readabilityEnabled) { + const readable = await extractReadableContent({ + html: body, + url: res.url || params.url, + extractMode: params.extractMode, + }); + if (readable?.text) { + text = readable.text; + title = readable.title; + extractor = "readability"; + } else { + const firecrawl = await tryFirecrawlFallback(params); + if (firecrawl) { + text = firecrawl.text; + title = firecrawl.title; + extractor = "firecrawl"; + } else { + throw new Error( + "Web fetch extraction failed: Readability and Firecrawl returned no content.", + ); + } + } + } else { + throw new Error( + "Web fetch extraction failed: Readability disabled and Firecrawl unavailable.", + ); + } + } else if (contentType.includes("application/json")) { + try { + text = JSON.stringify(JSON.parse(body), null, 2); + extractor = "json"; + } catch { + text = body; + extractor = "raw"; + } + } + + const truncated = truncateText(text, params.maxChars); + const payload = { + url: params.url, + finalUrl: res.url || params.url, + status: res.status, + contentType, + title, + extractMode: params.extractMode, + extractor, + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + }; + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; +} + +async function tryFirecrawlFallback(params: { + url: string; + extractMode: ExtractMode; + firecrawlEnabled: boolean; + firecrawlApiKey?: string; + firecrawlBaseUrl: string; + firecrawlOnlyMainContent: boolean; + firecrawlMaxAgeMs: number; + firecrawlProxy: "auto" | "basic" | "stealth"; + firecrawlStoreInCache: boolean; + firecrawlTimeoutSeconds: number; +}): Promise<{ text: string; title?: string } | null> { + if (!params.firecrawlEnabled || 
!params.firecrawlApiKey) return null; + try { + const firecrawl = await fetchFirecrawlContent({ + url: params.url, + extractMode: params.extractMode, + apiKey: params.firecrawlApiKey, + baseUrl: params.firecrawlBaseUrl, + onlyMainContent: params.firecrawlOnlyMainContent, + maxAgeMs: params.firecrawlMaxAgeMs, + proxy: params.firecrawlProxy, + storeInCache: params.firecrawlStoreInCache, + timeoutSeconds: params.firecrawlTimeoutSeconds, + }); + return { text: firecrawl.text, title: firecrawl.title }; + } catch { + return null; + } +} + +function resolveFirecrawlEndpoint(baseUrl: string): string { + const trimmed = baseUrl.trim(); + if (!trimmed) return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; + try { + const url = new URL(trimmed); + if (url.pathname && url.pathname !== "/") { + return url.toString(); + } + url.pathname = "/v2/scrape"; + return url.toString(); + } catch { + return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; + } +} + +export function createWebFetchTool(options?: { + config?: ClawdbotConfig; + sandboxed?: boolean; +}): AnyAgentTool | null { + const fetch = resolveFetchConfig(options?.config); + if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null; + const readabilityEnabled = resolveFetchReadabilityEnabled(fetch); + const firecrawl = resolveFirecrawlConfig(fetch); + const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl); + const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey }); + const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl); + const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl); + const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl); + const firecrawlTimeoutSeconds = resolveTimeoutSeconds( + firecrawl?.timeoutSeconds ?? 
fetch?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS, + ); + const userAgent = + (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || + DEFAULT_FETCH_USER_AGENT; + return { + label: "Web Fetch", + name: "web_fetch", + description: + "Fetch and extract readable content from a URL (HTML → markdown/text). Use for lightweight page access without browser automation.", + parameters: WebFetchSchema, + execute: async (_toolCallId, args) => { + const params = args as Record; + const url = readStringParam(params, "url", { required: true }); + const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown"; + const maxChars = readNumberParam(params, "maxChars", { integer: true }); + const result = await runWebFetch({ + url, + extractMode, + maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS), + timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), + cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), + userAgent, + readabilityEnabled, + firecrawlEnabled, + firecrawlApiKey, + firecrawlBaseUrl, + firecrawlOnlyMainContent, + firecrawlMaxAgeMs, + firecrawlProxy: "auto", + firecrawlStoreInCache: true, + firecrawlTimeoutSeconds, + }); + return jsonResult(result); + }, + }; +} diff --git a/src/agents/tools/web-search.ts b/src/agents/tools/web-search.ts new file mode 100644 index 000000000..c8b8eaae7 --- /dev/null +++ b/src/agents/tools/web-search.ts @@ -0,0 +1,398 @@ +import { Type } from "@sinclair/typebox"; + +import type { ClawdbotConfig } from "../../config/config.js"; +import type { AnyAgentTool } from "./common.js"; +import { jsonResult, readNumberParam, readStringParam } from "./common.js"; +import { + CacheEntry, + DEFAULT_CACHE_TTL_MINUTES, + DEFAULT_TIMEOUT_SECONDS, + normalizeCacheKey, + readCache, + readResponseText, + resolveCacheTtlMs, + resolveTimeoutSeconds, + withTimeout, + writeCache, +} from "./web-shared.js"; + 
+const SEARCH_PROVIDERS = ["brave", "perplexity"] as const; +const DEFAULT_SEARCH_COUNT = 5; +const MAX_SEARCH_COUNT = 10; + +const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"; +const DEFAULT_PERPLEXITY_BASE_URL = "https://openrouter.ai/api/v1"; +const PERPLEXITY_DIRECT_BASE_URL = "https://api.perplexity.ai"; +const DEFAULT_PERPLEXITY_MODEL = "perplexity/sonar-pro"; + +const SEARCH_CACHE = new Map>>(); + +const WebSearchSchema = Type.Object({ + query: Type.String({ description: "Search query string." }), + count: Type.Optional( + Type.Number({ + description: "Number of results to return (1-10).", + minimum: 1, + maximum: MAX_SEARCH_COUNT, + }), + ), + country: Type.Optional( + Type.String({ + description: + "2-letter country code for region-specific results (e.g., 'DE', 'US', 'ALL'). Default: 'US'.", + }), + ), + search_lang: Type.Optional( + Type.String({ + description: "ISO language code for search results (e.g., 'de', 'en', 'fr').", + }), + ), + ui_lang: Type.Optional( + Type.String({ + description: "ISO language code for UI elements.", + }), + ), +}); + +type WebSearchConfig = NonNullable["web"] extends infer Web + ? Web extends { search?: infer Search } + ? 
Search + : undefined + : undefined; + +type BraveSearchResult = { + title?: string; + url?: string; + description?: string; + age?: string; +}; + +type BraveSearchResponse = { + web?: { + results?: BraveSearchResult[]; + }; +}; + +type PerplexityConfig = { + apiKey?: string; + baseUrl?: string; + model?: string; +}; + +type PerplexityApiKeySource = + | "config" + | "perplexity_env" + | "openrouter_env" + | "none"; + +type PerplexitySearchResponse = { + choices?: Array<{ + message?: { + content?: string; + }; + }>; + citations?: string[]; +}; + +function resolveSearchConfig(cfg?: ClawdbotConfig): WebSearchConfig { + const search = cfg?.tools?.web?.search; + if (!search || typeof search !== "object") return undefined; + return search as WebSearchConfig; +} + +function resolveSearchEnabled(params: { search?: WebSearchConfig; sandboxed?: boolean }): boolean { + if (typeof params.search?.enabled === "boolean") return params.search.enabled; + if (params.sandboxed) return true; + return true; +} + +function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { + const fromConfig = + search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : ""; + const fromEnv = (process.env.BRAVE_API_KEY ?? "").trim(); + return fromConfig || fromEnv || undefined; +} + +function missingSearchKeyPayload(provider: (typeof SEARCH_PROVIDERS)[number]) { + if (provider === "perplexity") { + return { + error: "missing_perplexity_api_key", + message: + "web_search (perplexity) needs an API key. Set PERPLEXITY_API_KEY or OPENROUTER_API_KEY in the Gateway environment, or configure tools.web.search.perplexity.apiKey.", + docs: "https://docs.clawd.bot/tools/web", + }; + } + return { + error: "missing_brave_api_key", + message: + "web_search needs a Brave Search API key. 
Run `clawdbot configure --section web` to store it, or set BRAVE_API_KEY in the Gateway environment.", + docs: "https://docs.clawd.bot/tools/web", + }; +} + +function resolveSearchProvider(search?: WebSearchConfig): (typeof SEARCH_PROVIDERS)[number] { + const raw = + search && "provider" in search && typeof search.provider === "string" + ? search.provider.trim().toLowerCase() + : ""; + if (raw === "perplexity") return "perplexity"; + if (raw === "brave") return "brave"; + return "brave"; +} + +function resolvePerplexityConfig(search?: WebSearchConfig): PerplexityConfig { + if (!search || typeof search !== "object") return {}; + const perplexity = "perplexity" in search ? search.perplexity : undefined; + if (!perplexity || typeof perplexity !== "object") return {}; + return perplexity as PerplexityConfig; +} + +function resolvePerplexityApiKey(perplexity?: PerplexityConfig): { + apiKey?: string; + source: PerplexityApiKeySource; +} { + const fromConfig = + perplexity && "apiKey" in perplexity && typeof perplexity.apiKey === "string" + ? perplexity.apiKey.trim() + : ""; + if (fromConfig) { + return { apiKey: fromConfig, source: "config" }; + } + + const fromEnvPerplexity = (process.env.PERPLEXITY_API_KEY ?? "").trim(); + if (fromEnvPerplexity) { + return { apiKey: fromEnvPerplexity, source: "perplexity_env" }; + } + + const fromEnvOpenRouter = (process.env.OPENROUTER_API_KEY ?? "").trim(); + if (fromEnvOpenRouter) { + return { apiKey: fromEnvOpenRouter, source: "openrouter_env" }; + } + + return { apiKey: undefined, source: "none" }; +} + +function resolvePerplexityBaseUrl( + perplexity?: PerplexityConfig, + apiKeySource: PerplexityApiKeySource = "none", +): string { + const fromConfig = + perplexity && "baseUrl" in perplexity && typeof perplexity.baseUrl === "string" + ? 
perplexity.baseUrl.trim() + : ""; + if (fromConfig) return fromConfig; + if (apiKeySource === "perplexity_env") return PERPLEXITY_DIRECT_BASE_URL; + return DEFAULT_PERPLEXITY_BASE_URL; +} + +function resolvePerplexityModel(perplexity?: PerplexityConfig): string { + const fromConfig = + perplexity && "model" in perplexity && typeof perplexity.model === "string" + ? perplexity.model.trim() + : ""; + return fromConfig || DEFAULT_PERPLEXITY_MODEL; +} + +function resolveSearchCount(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + const clamped = Math.max(1, Math.min(MAX_SEARCH_COUNT, Math.floor(parsed))); + return clamped; +} + +function resolveSiteName(url: string | undefined): string | undefined { + if (!url) return undefined; + try { + return new URL(url).hostname; + } catch { + return undefined; + } +} + +async function runPerplexitySearch(params: { + query: string; + apiKey: string; + baseUrl: string; + model: string; + timeoutSeconds: number; +}): Promise<{ content: string; citations: string[] }> { + const endpoint = `${params.baseUrl.replace(/\/$/, "")}/chat/completions`; + + const res = await fetch(endpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${params.apiKey}`, + "HTTP-Referer": "https://clawdbot.com", + "X-Title": "Clawdbot Web Search", + }, + body: JSON.stringify({ + model: params.model, + messages: [ + { + role: "user", + content: params.query, + }, + ], + }), + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`); + } + + const data = (await res.json()) as PerplexitySearchResponse; + const content = data.choices?.[0]?.message?.content ?? "No response"; + const citations = data.citations ?? 
[]; + + return { content, citations }; +} + +async function runWebSearch(params: { + query: string; + count: number; + apiKey: string; + timeoutSeconds: number; + cacheTtlMs: number; + provider: (typeof SEARCH_PROVIDERS)[number]; + country?: string; + search_lang?: string; + ui_lang?: string; + perplexityBaseUrl?: string; + perplexityModel?: string; +}): Promise> { + const cacheKey = normalizeCacheKey( + `${params.provider}:${params.query}:${params.count}:${params.country || "default"}:${params.search_lang || "default"}:${params.ui_lang || "default"}`, + ); + const cached = readCache(SEARCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true }; + + const start = Date.now(); + + if (params.provider === "perplexity") { + const { content, citations } = await runPerplexitySearch({ + query: params.query, + apiKey: params.apiKey, + baseUrl: params.perplexityBaseUrl ?? DEFAULT_PERPLEXITY_BASE_URL, + model: params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL, + timeoutSeconds: params.timeoutSeconds, + }); + + const payload = { + query: params.query, + provider: params.provider, + model: params.perplexityModel ?? 
DEFAULT_PERPLEXITY_MODEL, + tookMs: Date.now() - start, + content, + citations, + }; + writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } + + if (params.provider !== "brave") { + throw new Error("Unsupported web search provider."); + } + + const url = new URL(BRAVE_SEARCH_ENDPOINT); + url.searchParams.set("q", params.query); + url.searchParams.set("count", String(params.count)); + if (params.country) { + url.searchParams.set("country", params.country); + } + if (params.search_lang) { + url.searchParams.set("search_lang", params.search_lang); + } + if (params.ui_lang) { + url.searchParams.set("ui_lang", params.ui_lang); + } + + const res = await fetch(url.toString(), { + method: "GET", + headers: { + Accept: "application/json", + "X-Subscription-Token": params.apiKey, + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`); + } + + const data = (await res.json()) as BraveSearchResponse; + const results = Array.isArray(data.web?.results) ? (data.web?.results ?? []) : []; + const mapped = results.map((entry) => ({ + title: entry.title ?? "", + url: entry.url ?? "", + description: entry.description ?? "", + published: entry.age ?? undefined, + siteName: resolveSiteName(entry.url ?? 
""), + })); + + const payload = { + query: params.query, + provider: params.provider, + count: mapped.length, + tookMs: Date.now() - start, + results: mapped, + }; + writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; +} + +export function createWebSearchTool(options?: { + config?: ClawdbotConfig; + sandboxed?: boolean; +}): AnyAgentTool | null { + const search = resolveSearchConfig(options?.config); + if (!resolveSearchEnabled({ search, sandboxed: options?.sandboxed })) return null; + + const provider = resolveSearchProvider(search); + const perplexityConfig = resolvePerplexityConfig(search); + + const description = + provider === "perplexity" + ? "Search the web using Perplexity Sonar (direct or via OpenRouter). Returns AI-synthesized answers with citations from real-time web search." + : "Search the web using Brave Search API. Supports region-specific and localized search via country and language parameters. Returns titles, URLs, and snippets for fast research."; + + return { + label: "Web Search", + name: "web_search", + description, + parameters: WebSearchSchema, + execute: async (_toolCallId, args) => { + const perplexityAuth = + provider === "perplexity" ? resolvePerplexityApiKey(perplexityConfig) : undefined; + const apiKey = + provider === "perplexity" ? perplexityAuth?.apiKey : resolveSearchApiKey(search); + + if (!apiKey) { + return jsonResult(missingSearchKeyPayload(provider)); + } + const params = args as Record; + const query = readStringParam(params, "query", { required: true }); + const count = + readNumberParam(params, "count", { integer: true }) ?? search?.maxResults ?? 
undefined; + const country = readStringParam(params, "country"); + const search_lang = readStringParam(params, "search_lang"); + const ui_lang = readStringParam(params, "ui_lang"); + const result = await runWebSearch({ + query, + count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT), + apiKey, + timeoutSeconds: resolveTimeoutSeconds(search?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), + cacheTtlMs: resolveCacheTtlMs(search?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), + provider, + country, + search_lang, + ui_lang, + perplexityBaseUrl: resolvePerplexityBaseUrl(perplexityConfig, perplexityAuth?.source), + perplexityModel: resolvePerplexityModel(perplexityConfig), + }); + return jsonResult(result); + }, + }; +} diff --git a/src/agents/tools/web-shared.ts b/src/agents/tools/web-shared.ts new file mode 100644 index 000000000..52876a7e4 --- /dev/null +++ b/src/agents/tools/web-shared.ts @@ -0,0 +1,82 @@ +export type CacheEntry = { + value: T; + expiresAt: number; + insertedAt: number; +}; + +export const DEFAULT_TIMEOUT_SECONDS = 30; +export const DEFAULT_CACHE_TTL_MINUTES = 15; +const DEFAULT_CACHE_MAX_ENTRIES = 100; + +export function resolveTimeoutSeconds(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(1, Math.floor(parsed)); +} + +export function resolveCacheTtlMs(value: unknown, fallbackMinutes: number): number { + const minutes = + typeof value === "number" && Number.isFinite(value) ? 
Math.max(0, value) : fallbackMinutes; + return Math.round(minutes * 60_000); +} + +export function normalizeCacheKey(value: string): string { + return value.trim().toLowerCase(); +} + +export function readCache( + cache: Map>, + key: string, +): { value: T; cached: boolean } | null { + const entry = cache.get(key); + if (!entry) return null; + if (Date.now() > entry.expiresAt) { + cache.delete(key); + return null; + } + return { value: entry.value, cached: true }; +} + +export function writeCache(cache: Map>, key: string, value: T, ttlMs: number) { + if (ttlMs <= 0) return; + if (cache.size >= DEFAULT_CACHE_MAX_ENTRIES) { + const oldest = cache.keys().next(); + if (!oldest.done) cache.delete(oldest.value); + } + cache.set(key, { + value, + expiresAt: Date.now() + ttlMs, + insertedAt: Date.now(), + }); +} + +export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): AbortSignal { + if (timeoutMs <= 0) return signal ?? new AbortController().signal; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + if (signal) { + signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + controller.abort(); + }, + { once: true }, + ); + } + controller.signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + }, + { once: true }, + ); + return controller.signal; +} + +export async function readResponseText(res: Response): Promise { + try { + return await res.text(); + } catch { + return ""; + } +} diff --git a/src/agents/tools/web-tools.enabled-defaults.test.ts b/src/agents/tools/web-tools.enabled-defaults.test.ts index 979a161d2..a1e72a9e5 100644 --- a/src/agents/tools/web-tools.enabled-defaults.test.ts +++ b/src/agents/tools/web-tools.enabled-defaults.test.ts @@ -89,3 +89,57 @@ describe("web_search country and language parameters", () => { expect(url.searchParams.get("ui_lang")).toBe("de"); }); }); + +describe("web_search perplexity baseUrl defaults", () => { + const 
priorFetch = global.fetch; + + afterEach(() => { + vi.unstubAllEnvs(); + // @ts-expect-error global fetch cleanup + global.fetch = priorFetch; + }); + + it("defaults to Perplexity direct when PERPLEXITY_API_KEY is set", async () => { + vi.stubEnv("PERPLEXITY_API_KEY", "pplx-test"); + const mockFetch = vi.fn(() => + Promise.resolve({ + ok: true, + json: () => + Promise.resolve({ choices: [{ message: { content: "ok" } }], citations: [] }), + } as Response), + ); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebSearchTool({ + config: { tools: { web: { search: { provider: "perplexity" } } } }, + sandboxed: true, + }); + await tool?.execute?.(1, { query: "test-openrouter" }); + + expect(mockFetch).toHaveBeenCalled(); + expect(mockFetch.mock.calls[0]?.[0]).toBe("https://api.perplexity.ai/chat/completions"); + }); + + it("defaults to OpenRouter when OPENROUTER_API_KEY is set", async () => { + vi.stubEnv("OPENROUTER_API_KEY", "sk-or-test"); + const mockFetch = vi.fn(() => + Promise.resolve({ + ok: true, + json: () => + Promise.resolve({ choices: [{ message: { content: "ok" } }], citations: [] }), + } as Response), + ); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebSearchTool({ + config: { tools: { web: { search: { provider: "perplexity" } } } }, + sandboxed: true, + }); + await tool?.execute?.(1, { query: "test" }); + + expect(mockFetch).toHaveBeenCalled(); + expect(mockFetch.mock.calls[0]?.[0]).toBe("https://openrouter.ai/api/v1/chat/completions"); + }); +}); diff --git a/src/agents/tools/web-tools.ts b/src/agents/tools/web-tools.ts index 6efa53ee1..3acaa4c76 100644 --- a/src/agents/tools/web-tools.ts +++ b/src/agents/tools/web-tools.ts @@ -1,1015 +1,2 @@ -import { Type } from "@sinclair/typebox"; - -import type { ClawdbotConfig } from "../../config/config.js"; -import { stringEnum } from "../schema/typebox.js"; -import type { AnyAgentTool } from "./common.js"; -import { jsonResult, 
readNumberParam, readStringParam } from "./common.js"; - -const SEARCH_PROVIDERS = ["brave", "perplexity"] as const; -const EXTRACT_MODES = ["markdown", "text"] as const; - -const DEFAULT_SEARCH_COUNT = 5; -const MAX_SEARCH_COUNT = 10; -const DEFAULT_FETCH_MAX_CHARS = 50_000; -const DEFAULT_TIMEOUT_SECONDS = 30; -const DEFAULT_CACHE_TTL_MINUTES = 15; -const DEFAULT_CACHE_MAX_ENTRIES = 100; -const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev"; -const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000; -const DEFAULT_FETCH_USER_AGENT = - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"; - -const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"; -const DEFAULT_PERPLEXITY_BASE_URL = "https://openrouter.ai/api/v1"; -const DEFAULT_PERPLEXITY_MODEL = "perplexity/sonar-pro"; - -type WebSearchConfig = NonNullable["web"] extends infer Web - ? Web extends { search?: infer Search } - ? Search - : undefined - : undefined; - -type WebFetchConfig = NonNullable["web"] extends infer Web - ? Web extends { fetch?: infer Fetch } - ? Fetch - : undefined - : undefined; - -type FirecrawlFetchConfig = - | { - enabled?: boolean; - apiKey?: string; - baseUrl?: string; - onlyMainContent?: boolean; - maxAgeMs?: number; - timeoutSeconds?: number; - } - | undefined; -type CacheEntry = { - value: T; - expiresAt: number; - insertedAt: number; -}; - -const SEARCH_CACHE = new Map>>(); -const FETCH_CACHE = new Map>>(); - -const WebSearchSchema = Type.Object({ - query: Type.String({ description: "Search query string." }), - count: Type.Optional( - Type.Number({ - description: "Number of results to return (1-10).", - minimum: 1, - maximum: MAX_SEARCH_COUNT, - }), - ), - country: Type.Optional( - Type.String({ - description: - "2-letter country code for region-specific results (e.g., 'DE', 'US', 'ALL'). 
Default: 'US'.", - }), - ), - search_lang: Type.Optional( - Type.String({ - description: "ISO language code for search results (e.g., 'de', 'en', 'fr').", - }), - ), - ui_lang: Type.Optional( - Type.String({ - description: "ISO language code for UI elements.", - }), - ), -}); - -const WebFetchSchema = Type.Object({ - url: Type.String({ description: "HTTP or HTTPS URL to fetch." }), - extractMode: Type.Optional( - stringEnum(EXTRACT_MODES, { - description: 'Extraction mode ("markdown" or "text").', - default: "markdown", - }), - ), - maxChars: Type.Optional( - Type.Number({ - description: "Maximum characters to return (truncates when exceeded).", - minimum: 100, - }), - ), -}); - -type BraveSearchResult = { - title?: string; - url?: string; - description?: string; - age?: string; -}; - -type BraveSearchResponse = { - web?: { - results?: BraveSearchResult[]; - }; -}; - -function resolveSearchConfig(cfg?: ClawdbotConfig): WebSearchConfig { - const search = cfg?.tools?.web?.search; - if (!search || typeof search !== "object") return undefined; - return search as WebSearchConfig; -} - -function resolveFetchConfig(cfg?: ClawdbotConfig): WebFetchConfig { - const fetch = cfg?.tools?.web?.fetch; - if (!fetch || typeof fetch !== "object") return undefined; - return fetch as WebFetchConfig; -} - -function resolveSearchEnabled(params: { search?: WebSearchConfig; sandboxed?: boolean }): boolean { - if (typeof params.search?.enabled === "boolean") return params.search.enabled; - if (params.sandboxed) return true; - return true; -} - -function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boolean }): boolean { - if (typeof params.fetch?.enabled === "boolean") return params.fetch.enabled; - return true; -} - -function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean { - if (typeof fetch?.readability === "boolean") return fetch.readability; - return true; -} - -function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig { - if 
(!fetch || typeof fetch !== "object") return undefined; - const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined; - if (!firecrawl || typeof firecrawl !== "object") return undefined; - return firecrawl as FirecrawlFetchConfig; -} - -function resolveSearchApiKey(search?: WebSearchConfig): string | undefined { - const fromConfig = - search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : ""; - const fromEnv = (process.env.BRAVE_API_KEY ?? "").trim(); - return fromConfig || fromEnv || undefined; -} - -function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined { - const fromConfig = - firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string" - ? firecrawl.apiKey.trim() - : ""; - const fromEnv = (process.env.FIRECRAWL_API_KEY ?? "").trim(); - return fromConfig || fromEnv || undefined; -} - -function resolveFirecrawlEnabled(params: { - firecrawl?: FirecrawlFetchConfig; - apiKey?: string; -}): boolean { - if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled; - return Boolean(params.apiKey); -} - -function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string { - const raw = - firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string" - ? firecrawl.baseUrl.trim() - : ""; - return raw || DEFAULT_FIRECRAWL_BASE_URL; -} - -function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean { - if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent; - return true; -} - -function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined { - const raw = - firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number" - ? firecrawl.maxAgeMs - : undefined; - if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined; - const parsed = Math.max(0, Math.floor(raw)); - return parsed > 0 ? 
parsed : undefined; -} - -function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number { - const resolved = resolveFirecrawlMaxAgeMs(firecrawl); - if (typeof resolved === "number") return resolved; - return DEFAULT_FIRECRAWL_MAX_AGE_MS; -} - -function missingSearchKeyPayload(provider: (typeof SEARCH_PROVIDERS)[number]) { - if (provider === "perplexity") { - return { - error: "missing_perplexity_api_key", - message: - "web_search (perplexity) needs an API key. Set PERPLEXITY_API_KEY or OPENROUTER_API_KEY in the Gateway environment, or configure tools.web.search.perplexity.apiKey.", - docs: "https://docs.clawd.bot/tools/web", - }; - } - return { - error: "missing_brave_api_key", - message: - "web_search needs a Brave Search API key. Run `clawdbot configure --section web` to store it, or set BRAVE_API_KEY in the Gateway environment.", - docs: "https://docs.clawd.bot/tools/web", - }; -} - -function resolveSearchProvider(search?: WebSearchConfig): (typeof SEARCH_PROVIDERS)[number] { - const raw = - search && "provider" in search && typeof search.provider === "string" - ? search.provider.trim().toLowerCase() - : ""; - if (raw === "perplexity") return "perplexity"; - if (raw === "brave") return "brave"; - return "brave"; -} - -type PerplexityConfig = { - apiKey?: string; - baseUrl?: string; - model?: string; -}; - -function resolvePerplexityConfig(search?: WebSearchConfig): PerplexityConfig { - if (!search || typeof search !== "object") return {}; - const perplexity = "perplexity" in search ? search.perplexity : undefined; - if (!perplexity || typeof perplexity !== "object") return {}; - return perplexity as PerplexityConfig; -} - -function resolvePerplexityApiKey(perplexity?: PerplexityConfig): string | undefined { - const fromConfig = - perplexity && "apiKey" in perplexity && typeof perplexity.apiKey === "string" - ? perplexity.apiKey.trim() - : ""; - const fromEnvPerplexity = (process.env.PERPLEXITY_API_KEY ?? 
"").trim(); - const fromEnvOpenRouter = (process.env.OPENROUTER_API_KEY ?? "").trim(); - return fromConfig || fromEnvPerplexity || fromEnvOpenRouter || undefined; -} - -function resolvePerplexityBaseUrl(perplexity?: PerplexityConfig): string { - const fromConfig = - perplexity && "baseUrl" in perplexity && typeof perplexity.baseUrl === "string" - ? perplexity.baseUrl.trim() - : ""; - return fromConfig || DEFAULT_PERPLEXITY_BASE_URL; -} - -function resolvePerplexityModel(perplexity?: PerplexityConfig): string { - const fromConfig = - perplexity && "model" in perplexity && typeof perplexity.model === "string" - ? perplexity.model.trim() - : ""; - return fromConfig || DEFAULT_PERPLEXITY_MODEL; -} - -function resolveTimeoutSeconds(value: unknown, fallback: number): number { - const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; - return Math.max(1, Math.floor(parsed)); -} - -function resolveCacheTtlMs(value: unknown, fallbackMinutes: number): number { - const minutes = - typeof value === "number" && Number.isFinite(value) ? Math.max(0, value) : fallbackMinutes; - return Math.round(minutes * 60_000); -} - -function resolveMaxChars(value: unknown, fallback: number): number { - const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; - return Math.max(100, Math.floor(parsed)); -} - -function resolveSearchCount(value: unknown, fallback: number): number { - const parsed = typeof value === "number" && Number.isFinite(value) ? 
value : fallback; - const clamped = Math.max(1, Math.min(MAX_SEARCH_COUNT, Math.floor(parsed))); - return clamped; -} - -function normalizeCacheKey(value: string): string { - return value.trim().toLowerCase(); -} - -function readCache( - cache: Map>, - key: string, -): { value: T; cached: boolean } | null { - const entry = cache.get(key); - if (!entry) return null; - if (Date.now() > entry.expiresAt) { - cache.delete(key); - return null; - } - return { value: entry.value, cached: true }; -} - -function writeCache(cache: Map>, key: string, value: T, ttlMs: number) { - if (ttlMs <= 0) return; - if (cache.size >= DEFAULT_CACHE_MAX_ENTRIES) { - const oldest = cache.keys().next(); - if (!oldest.done) cache.delete(oldest.value); - } - cache.set(key, { - value, - expiresAt: Date.now() + ttlMs, - insertedAt: Date.now(), - }); -} - -function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): AbortSignal { - if (timeoutMs <= 0) return signal ?? new AbortController().signal; - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), timeoutMs); - if (signal) { - signal.addEventListener( - "abort", - () => { - clearTimeout(timer); - controller.abort(); - }, - { once: true }, - ); - } - controller.signal.addEventListener( - "abort", - () => { - clearTimeout(timer); - }, - { once: true }, - ); - return controller.signal; -} - -function decodeEntities(value: string): string { - return value - .replace(/ /gi, " ") - .replace(/&/gi, "&") - .replace(/"/gi, '"') - .replace(/'/gi, "'") - .replace(/</gi, "<") - .replace(/>/gi, ">") - .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16))) - .replace(/&#(\d+);/gi, (_, dec) => String.fromCharCode(Number.parseInt(dec, 10))); -} - -function stripTags(value: string): string { - return decodeEntities(value.replace(/<[^>]+>/g, "")); -} - -function normalizeWhitespace(value: string): string { - return value - .replace(/\r/g, "") - .replace(/[ \t]+\n/g, "\n") - 
.replace(/\n{3,}/g, "\n\n") - .replace(/[ \t]{2,}/g, " ") - .trim(); -} - -function htmlToMarkdown(html: string): { text: string; title?: string } { - const titleMatch = html.match(/]*>([\s\S]*?)<\/title>/i); - const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined; - let text = html - .replace(//gi, "") - .replace(//gi, "") - .replace(//gi, ""); - text = text.replace(/]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_, href, body) => { - const label = normalizeWhitespace(stripTags(body)); - if (!label) return href; - return `[${label}](${href})`; - }); - text = text.replace(/]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => { - const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10)))); - const label = normalizeWhitespace(stripTags(body)); - return `\n${prefix} ${label}\n`; - }); - text = text.replace(/]*>([\s\S]*?)<\/li>/gi, (_, body) => { - const label = normalizeWhitespace(stripTags(body)); - return label ? `\n- ${label}` : ""; - }); - text = text - .replace(/<(br|hr)\s*\/?>/gi, "\n") - .replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n"); - text = stripTags(text); - text = normalizeWhitespace(text); - return { text, title }; -} - -function markdownToText(markdown: string): string { - let text = markdown; - text = text.replace(/!\[[^\]]*]\([^)]+\)/g, ""); - text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1"); - text = text.replace(/```[\s\S]*?```/g, (block) => - block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""), - ); - text = text.replace(/`([^`]+)`/g, "$1"); - text = text.replace(/^#{1,6}\s+/gm, ""); - text = text.replace(/^\s*[-*+]\s+/gm, ""); - text = text.replace(/^\s*\d+\.\s+/gm, ""); - return normalizeWhitespace(text); -} - -function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } { - if (value.length <= maxChars) return { text: value, truncated: false }; - return { text: value.slice(0, maxChars), truncated: true }; -} - -function 
resolveSiteName(url: string | undefined): string | undefined { - if (!url) return undefined; - try { - return new URL(url).hostname; - } catch { - return undefined; - } -} - -async function readResponseText(res: Response): Promise { - try { - return await res.text(); - } catch { - return ""; - } -} - -export async function extractReadableContent(params: { - html: string; - url: string; - extractMode: (typeof EXTRACT_MODES)[number]; -}): Promise<{ text: string; title?: string } | null> { - try { - const [{ Readability }, { parseHTML }] = await Promise.all([ - import("@mozilla/readability"), - import("linkedom"), - ]); - const { document } = parseHTML(params.html); - try { - (document as { baseURI?: string }).baseURI = params.url; - } catch { - // Best-effort base URI for relative links. - } - const reader = new Readability(document, { charThreshold: 0 }); - const parsed = reader.parse(); - if (!parsed?.content) return null; - const title = parsed.title || undefined; - if (params.extractMode === "text") { - const text = normalizeWhitespace(parsed.textContent ?? ""); - return { text, title }; - } - const rendered = htmlToMarkdown(parsed.content); - return { text: rendered.text, title: title ?? 
rendered.title }; - } catch { - return null; - } -} - -export async function fetchFirecrawlContent(params: { - url: string; - extractMode: (typeof EXTRACT_MODES)[number]; - apiKey: string; - baseUrl: string; - onlyMainContent: boolean; - maxAgeMs: number; - proxy: "auto" | "basic" | "stealth"; - storeInCache: boolean; - timeoutSeconds: number; -}): Promise<{ - text: string; - title?: string; - finalUrl?: string; - status?: number; - warning?: string; -}> { - const endpoint = resolveFirecrawlEndpoint(params.baseUrl); - const body: Record = { - url: params.url, - formats: ["markdown"], - onlyMainContent: params.onlyMainContent, - timeout: params.timeoutSeconds * 1000, - maxAge: params.maxAgeMs, - proxy: params.proxy, - storeInCache: params.storeInCache, - }; - - const res = await fetch(endpoint, { - method: "POST", - headers: { - Authorization: `Bearer ${params.apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - signal: withTimeout(undefined, params.timeoutSeconds * 1000), - }); - - const payload = (await res.json()) as { - success?: boolean; - data?: { - markdown?: string; - content?: string; - metadata?: { - title?: string; - sourceURL?: string; - statusCode?: number; - }; - }; - warning?: string; - error?: string; - }; - - if (!res.ok || payload?.success === false) { - const detail = payload?.error || res.statusText; - throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim()); - } - - const data = payload?.data ?? {}; - const rawText = - typeof data.markdown === "string" - ? data.markdown - : typeof data.content === "string" - ? data.content - : ""; - const text = params.extractMode === "text" ? 
markdownToText(rawText) : rawText; - return { - text, - title: data.metadata?.title, - finalUrl: data.metadata?.sourceURL, - status: data.metadata?.statusCode, - warning: payload?.warning, - }; -} - -type PerplexitySearchResponse = { - choices?: Array<{ - message?: { - content?: string; - }; - }>; - citations?: string[]; -}; - -async function runPerplexitySearch(params: { - query: string; - apiKey: string; - baseUrl: string; - model: string; - timeoutSeconds: number; -}): Promise<{ content: string; citations: string[] }> { - const endpoint = `${params.baseUrl.replace(/\/$/, "")}/chat/completions`; - - const res = await fetch(endpoint, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${params.apiKey}`, - "HTTP-Referer": "https://clawdbot.com", - "X-Title": "Clawdbot Web Search", - }, - body: JSON.stringify({ - model: params.model, - messages: [ - { - role: "user", - content: params.query, - }, - ], - }), - signal: withTimeout(undefined, params.timeoutSeconds * 1000), - }); - - if (!res.ok) { - const detail = await readResponseText(res); - throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`); - } - - const data = (await res.json()) as PerplexitySearchResponse; - const content = data.choices?.[0]?.message?.content ?? "No response"; - const citations = data.citations ?? 
[]; - - return { content, citations }; -} - -async function runWebSearch(params: { - query: string; - count: number; - apiKey: string; - timeoutSeconds: number; - cacheTtlMs: number; - provider: (typeof SEARCH_PROVIDERS)[number]; - country?: string; - search_lang?: string; - ui_lang?: string; - perplexityBaseUrl?: string; - perplexityModel?: string; -}): Promise> { - const cacheKey = normalizeCacheKey( - `${params.provider}:${params.query}:${params.count}:${params.country || "default"}:${params.search_lang || "default"}:${params.ui_lang || "default"}`, - ); - const cached = readCache(SEARCH_CACHE, cacheKey); - if (cached) return { ...cached.value, cached: true }; - - const start = Date.now(); - - if (params.provider === "perplexity") { - const { content, citations } = await runPerplexitySearch({ - query: params.query, - apiKey: params.apiKey, - baseUrl: params.perplexityBaseUrl ?? DEFAULT_PERPLEXITY_BASE_URL, - model: params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL, - timeoutSeconds: params.timeoutSeconds, - }); - - const payload = { - query: params.query, - provider: params.provider, - model: params.perplexityModel ?? 
DEFAULT_PERPLEXITY_MODEL, - tookMs: Date.now() - start, - content, - citations, - }; - writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); - return payload; - } - - if (params.provider !== "brave") { - throw new Error("Unsupported web search provider."); - } - - const url = new URL(BRAVE_SEARCH_ENDPOINT); - url.searchParams.set("q", params.query); - url.searchParams.set("count", String(params.count)); - if (params.country) { - url.searchParams.set("country", params.country); - } - if (params.search_lang) { - url.searchParams.set("search_lang", params.search_lang); - } - if (params.ui_lang) { - url.searchParams.set("ui_lang", params.ui_lang); - } - - const res = await fetch(url.toString(), { - method: "GET", - headers: { - Accept: "application/json", - "X-Subscription-Token": params.apiKey, - }, - signal: withTimeout(undefined, params.timeoutSeconds * 1000), - }); - - if (!res.ok) { - const detail = await readResponseText(res); - throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`); - } - - const data = (await res.json()) as BraveSearchResponse; - const results = Array.isArray(data.web?.results) ? (data.web?.results ?? []) : []; - const mapped = results.map((entry) => ({ - title: entry.title ?? "", - url: entry.url ?? "", - description: entry.description ?? "", - published: entry.age ?? undefined, - siteName: resolveSiteName(entry.url ?? 
""), - })); - - const payload = { - query: params.query, - provider: params.provider, - count: mapped.length, - tookMs: Date.now() - start, - results: mapped, - }; - writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); - return payload; -} - -async function runWebFetch(params: { - url: string; - extractMode: (typeof EXTRACT_MODES)[number]; - maxChars: number; - timeoutSeconds: number; - cacheTtlMs: number; - userAgent: string; - readabilityEnabled: boolean; - firecrawlEnabled: boolean; - firecrawlApiKey?: string; - firecrawlBaseUrl: string; - firecrawlOnlyMainContent: boolean; - firecrawlMaxAgeMs: number; - firecrawlProxy: "auto" | "basic" | "stealth"; - firecrawlStoreInCache: boolean; - firecrawlTimeoutSeconds: number; -}): Promise> { - const cacheKey = normalizeCacheKey( - `fetch:${params.url}:${params.extractMode}:${params.maxChars}`, - ); - const cached = readCache(FETCH_CACHE, cacheKey); - if (cached) return { ...cached.value, cached: true }; - - let parsedUrl: URL; - try { - parsedUrl = new URL(params.url); - } catch { - throw new Error("Invalid URL: must be http or https"); - } - if (!["http:", "https:"].includes(parsedUrl.protocol)) { - throw new Error("Invalid URL: must be http or https"); - } - - const start = Date.now(); - let res: Response; - try { - res = await fetch(parsedUrl.toString(), { - method: "GET", - headers: { - Accept: "*/*", - "User-Agent": params.userAgent, - "Accept-Language": "en-US,en;q=0.9", - }, - signal: withTimeout(undefined, params.timeoutSeconds * 1000), - }); - } catch (error) { - if (params.firecrawlEnabled && params.firecrawlApiKey) { - const firecrawl = await fetchFirecrawlContent({ - url: params.url, - extractMode: params.extractMode, - apiKey: params.firecrawlApiKey, - baseUrl: params.firecrawlBaseUrl, - onlyMainContent: params.firecrawlOnlyMainContent, - maxAgeMs: params.firecrawlMaxAgeMs, - proxy: params.firecrawlProxy, - storeInCache: params.firecrawlStoreInCache, - timeoutSeconds: 
params.firecrawlTimeoutSeconds, - }); - const truncated = truncateText(firecrawl.text, params.maxChars); - const payload = { - url: params.url, - finalUrl: firecrawl.finalUrl || params.url, - status: firecrawl.status ?? 200, - contentType: "text/markdown", - title: firecrawl.title, - extractMode: params.extractMode, - extractor: "firecrawl", - truncated: truncated.truncated, - length: truncated.text.length, - fetchedAt: new Date().toISOString(), - tookMs: Date.now() - start, - text: truncated.text, - warning: firecrawl.warning, - }; - writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); - return payload; - } - throw error; - } - - if (!res.ok) { - if (params.firecrawlEnabled && params.firecrawlApiKey) { - const firecrawl = await fetchFirecrawlContent({ - url: params.url, - extractMode: params.extractMode, - apiKey: params.firecrawlApiKey, - baseUrl: params.firecrawlBaseUrl, - onlyMainContent: params.firecrawlOnlyMainContent, - maxAgeMs: params.firecrawlMaxAgeMs, - proxy: params.firecrawlProxy, - storeInCache: params.firecrawlStoreInCache, - timeoutSeconds: params.firecrawlTimeoutSeconds, - }); - const truncated = truncateText(firecrawl.text, params.maxChars); - const payload = { - url: params.url, - finalUrl: firecrawl.finalUrl || params.url, - status: firecrawl.status ?? res.status, - contentType: "text/markdown", - title: firecrawl.title, - extractMode: params.extractMode, - extractor: "firecrawl", - truncated: truncated.truncated, - length: truncated.text.length, - fetchedAt: new Date().toISOString(), - tookMs: Date.now() - start, - text: truncated.text, - warning: firecrawl.warning, - }; - writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); - return payload; - } - const detail = await readResponseText(res); - throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`); - } - - const contentType = res.headers.get("content-type") ?? 
"application/octet-stream"; - const body = await readResponseText(res); - - let title: string | undefined; - let extractor = "raw"; - let text = body; - if (contentType.includes("text/html")) { - if (params.readabilityEnabled) { - const readable = await extractReadableContent({ - html: body, - url: res.url || params.url, - extractMode: params.extractMode, - }); - if (readable?.text) { - text = readable.text; - title = readable.title; - extractor = "readability"; - } else { - const firecrawl = await tryFirecrawlFallback(params); - if (firecrawl) { - text = firecrawl.text; - title = firecrawl.title; - extractor = "firecrawl"; - } else { - throw new Error( - "Web fetch extraction failed: Readability and Firecrawl returned no content.", - ); - } - } - } else { - throw new Error( - "Web fetch extraction failed: Readability disabled and Firecrawl unavailable.", - ); - } - } else if (contentType.includes("application/json")) { - try { - text = JSON.stringify(JSON.parse(body), null, 2); - extractor = "json"; - } catch { - text = body; - extractor = "raw"; - } - } - - const truncated = truncateText(text, params.maxChars); - const payload = { - url: params.url, - finalUrl: res.url || params.url, - status: res.status, - contentType, - title, - extractMode: params.extractMode, - extractor, - truncated: truncated.truncated, - length: truncated.text.length, - fetchedAt: new Date().toISOString(), - tookMs: Date.now() - start, - text: truncated.text, - }; - writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); - return payload; -} - -async function tryFirecrawlFallback(params: { - url: string; - extractMode: (typeof EXTRACT_MODES)[number]; - firecrawlEnabled: boolean; - firecrawlApiKey?: string; - firecrawlBaseUrl: string; - firecrawlOnlyMainContent: boolean; - firecrawlMaxAgeMs: number; - firecrawlProxy: "auto" | "basic" | "stealth"; - firecrawlStoreInCache: boolean; - firecrawlTimeoutSeconds: number; -}): Promise<{ text: string; title?: string } | null> { - if 
(!params.firecrawlEnabled || !params.firecrawlApiKey) return null; - try { - const firecrawl = await fetchFirecrawlContent({ - url: params.url, - extractMode: params.extractMode, - apiKey: params.firecrawlApiKey, - baseUrl: params.firecrawlBaseUrl, - onlyMainContent: params.firecrawlOnlyMainContent, - maxAgeMs: params.firecrawlMaxAgeMs, - proxy: params.firecrawlProxy, - storeInCache: params.firecrawlStoreInCache, - timeoutSeconds: params.firecrawlTimeoutSeconds, - }); - return { text: firecrawl.text, title: firecrawl.title }; - } catch { - return null; - } -} - -export function createWebSearchTool(options?: { - config?: ClawdbotConfig; - sandboxed?: boolean; -}): AnyAgentTool | null { - const search = resolveSearchConfig(options?.config); - if (!resolveSearchEnabled({ search, sandboxed: options?.sandboxed })) return null; - - const provider = resolveSearchProvider(search); - const perplexityConfig = resolvePerplexityConfig(search); - - // Determine description based on provider - const description = - provider === "perplexity" - ? "Search the web using Perplexity Sonar (via OpenRouter). Returns AI-synthesized answers with citations from real-time web search." - : "Search the web using Brave Search API. Supports region-specific and localized search via country and language parameters. Returns titles, URLs, and snippets for fast research."; - - return { - label: "Web Search", - name: "web_search", - description, - parameters: WebSearchSchema, - execute: async (_toolCallId, args) => { - // Resolve API key based on provider - const apiKey = - provider === "perplexity" - ? resolvePerplexityApiKey(perplexityConfig) - : resolveSearchApiKey(search); - - if (!apiKey) { - return jsonResult(missingSearchKeyPayload(provider)); - } - const params = args as Record; - const query = readStringParam(params, "query", { required: true }); - const count = - readNumberParam(params, "count", { integer: true }) ?? search?.maxResults ?? 
undefined; - const country = readStringParam(params, "country"); - const search_lang = readStringParam(params, "search_lang"); - const ui_lang = readStringParam(params, "ui_lang"); - const result = await runWebSearch({ - query, - count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT), - apiKey, - timeoutSeconds: resolveTimeoutSeconds(search?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), - cacheTtlMs: resolveCacheTtlMs(search?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), - provider, - country, - search_lang, - ui_lang, - perplexityBaseUrl: resolvePerplexityBaseUrl(perplexityConfig), - perplexityModel: resolvePerplexityModel(perplexityConfig), - }); - return jsonResult(result); - }, - }; -} - -function resolveFirecrawlEndpoint(baseUrl: string): string { - const trimmed = baseUrl.trim(); - if (!trimmed) return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; - try { - const url = new URL(trimmed); - if (url.pathname && url.pathname !== "/") { - return url.toString(); - } - url.pathname = "/v2/scrape"; - return url.toString(); - } catch { - return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`; - } -} - -export function createWebFetchTool(options?: { - config?: ClawdbotConfig; - sandboxed?: boolean; -}): AnyAgentTool | null { - const fetch = resolveFetchConfig(options?.config); - if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null; - const readabilityEnabled = resolveFetchReadabilityEnabled(fetch); - const firecrawl = resolveFirecrawlConfig(fetch); - const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl); - const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey }); - const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl); - const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl); - const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl); - const firecrawlTimeoutSeconds = resolveTimeoutSeconds( - firecrawl?.timeoutSeconds ?? 
fetch?.timeoutSeconds, - DEFAULT_TIMEOUT_SECONDS, - ); - const userAgent = - (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || - DEFAULT_FETCH_USER_AGENT; - return { - label: "Web Fetch", - name: "web_fetch", - description: - "Fetch and extract readable content from a URL (HTML → markdown/text). Use for lightweight page access without browser automation.", - parameters: WebFetchSchema, - execute: async (_toolCallId, args) => { - const params = args as Record; - const url = readStringParam(params, "url", { required: true }); - const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown"; - const maxChars = readNumberParam(params, "maxChars", { integer: true }); - const result = await runWebFetch({ - url, - extractMode, - maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS), - timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), - cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), - userAgent, - readabilityEnabled, - firecrawlEnabled, - firecrawlApiKey, - firecrawlBaseUrl, - firecrawlOnlyMainContent, - firecrawlMaxAgeMs, - firecrawlProxy: "auto", - firecrawlStoreInCache: true, - firecrawlTimeoutSeconds, - }); - return jsonResult(result); - }, - }; -} +export { createWebFetchTool, extractReadableContent, fetchFirecrawlContent } from "./web-fetch.js"; +export { createWebSearchTool } from "./web-search.js";