Web: trim HTML error bodies in web_fetch (#1193)
* Web: trim HTML error bodies in web_fetch * fix: trim web_fetch HTML error bodies (#1193) (thanks @sebslight) --------- Co-authored-by: Sebastian Slight <sbarrios93@gmail.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -9,6 +9,7 @@ Docs: https://docs.clawd.bot
|
||||
|
||||
### Fixes
|
||||
- Plugins: surface plugin load/register/config errors in gateway logs with plugin/source context.
|
||||
- Web: trim HTML error bodies in web_fetch failures. (#1193) — thanks @sebslight.
|
||||
|
||||
## 2026.1.18-5
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ function normalizeWhitespace(value: string): string {
|
||||
.trim();
|
||||
}
|
||||
|
||||
function htmlToMarkdown(html: string): { text: string; title?: string } {
|
||||
export function htmlToMarkdown(html: string): { text: string; title?: string } {
|
||||
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
||||
const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined;
|
||||
let text = html
|
||||
|
||||
@@ -18,6 +18,7 @@ import {
|
||||
} from "./web-shared.js";
|
||||
import {
|
||||
extractReadableContent,
|
||||
htmlToMarkdown,
|
||||
markdownToText,
|
||||
truncateText,
|
||||
type ExtractMode,
|
||||
@@ -28,6 +29,7 @@ export { extractReadableContent } from "./web-fetch-utils.js";
|
||||
const EXTRACT_MODES = ["markdown", "text"] as const;
|
||||
|
||||
const DEFAULT_FETCH_MAX_CHARS = 50_000;
|
||||
const DEFAULT_ERROR_MAX_CHARS = 4_000;
|
||||
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
|
||||
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
|
||||
const DEFAULT_FETCH_USER_AGENT =
|
||||
@@ -142,6 +144,30 @@ function resolveMaxChars(value: unknown, fallback: number): number {
|
||||
return Math.max(100, Math.floor(parsed));
|
||||
}
|
||||
|
||||
/**
 * Heuristically decide whether a response body is an HTML document.
 *
 * Only the first 256 characters (after leading whitespace) are inspected. Within that
 * window an optional preamble of XML declarations (`<?xml ... ?>`) and HTML comments is
 * skipped, so XHTML-style pages and pages that open with a banner comment are
 * recognised as well as bodies that start directly with a doctype or `<html` tag.
 *
 * @param value Raw response body text.
 * @returns `true` when the body starts (after any skipped preamble) with
 *   `<!doctype html` or `<html`, compared case-insensitively; `false` otherwise,
 *   including for empty or whitespace-only input.
 */
function looksLikeHtml(value: string): boolean {
  const trimmed = value.trimStart();
  if (!trimmed) return false;
  const head = trimmed.slice(0, 256).toLowerCase();
  // Skip XML declarations and HTML comments that may precede the doctype/root tag.
  // A preamble that does not terminate inside the 256-char window is left in place,
  // so such bodies are simply not classified as HTML.
  const withoutPreamble = head.replace(/^(?:<\?xml[^>]*\?>\s*|<!--[\s\S]*?-->\s*)+/, "");
  return withoutPreamble.startsWith("<!doctype html") || withoutPreamble.startsWith("<html");
}
|
||||
|
||||
/**
 * Build the detail text that is appended to a web_fetch failure message.
 *
 * When the body is HTML (declared through the content-type header as `text/html` or
 * `application/xhtml+xml`, or detected by `looksLikeHtml`), it is rendered with
 * `htmlToMarkdown` and then `markdownToText`, with the page title (if any) placed on
 * its own first line. Any other body is used as-is. The result is trimmed and handed
 * to `truncateText` together with `maxChars`.
 *
 * @param params.detail Raw response body text.
 * @param params.contentType Value of the response's content-type header, if present.
 *   Matching is case-insensitive.
 * @param params.maxChars Character limit forwarded to `truncateText`.
 * @returns The `text` field of the `truncateText` result, or `""` when `detail` is empty.
 */
function formatWebFetchErrorDetail(params: {
  detail: string;
  contentType?: string | null;
  maxChars: number;
}): string {
  const { detail, contentType, maxChars } = params;
  if (!detail) return "";
  const contentTypeLower = contentType?.toLowerCase() ?? "";
  // XHTML responses carry the same kind of markup as text/html and need the same cleanup.
  const declaredHtml =
    contentTypeLower.includes("text/html") || contentTypeLower.includes("application/xhtml+xml");
  let text = detail;
  if (declaredHtml || looksLikeHtml(detail)) {
    const rendered = htmlToMarkdown(detail);
    const withTitle = rendered.title ? `${rendered.title}\n${rendered.text}` : rendered.text;
    text = markdownToText(withTitle);
  }
  const truncated = truncateText(text.trim(), maxChars);
  return truncated.text;
}
|
||||
export async function fetchFirecrawlContent(params: {
|
||||
url: string;
|
||||
extractMode: ExtractMode;
|
||||
@@ -329,7 +355,12 @@ async function runWebFetch(params: {
|
||||
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
||||
return payload;
|
||||
}
|
||||
const detail = await readResponseText(res);
|
||||
const rawDetail = await readResponseText(res);
|
||||
const detail = formatWebFetchErrorDetail({
|
||||
detail: rawDetail,
|
||||
contentType: res.headers.get("content-type"),
|
||||
maxChars: DEFAULT_ERROR_MAX_CHARS,
|
||||
});
|
||||
throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,20 @@ function firecrawlError(): MockResponse {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Test helper: build a non-OK mock response whose body is the given HTML string.
 *
 * @param html Body returned by the response's `text()` method.
 * @param status HTTP status to report (404 unless overridden).
 * @param url URL to report on the response.
 * @param contentType Value for the content-type header; a falsy value (e.g. `null`)
 *   produces a response with no headers set.
 */
function errorHtmlResponse(
  html: string,
  status = 404,
  url = "https://example.com/",
  contentType: string | null = "text/html; charset=utf-8",
): MockResponse {
  const headers = !contentType
    ? makeHeaders({})
    : makeHeaders({ "content-type": contentType });
  const readBody = async () => html;
  return { ok: false, status, url, headers, text: readBody };
}
|
||||
function requestUrl(input: RequestInfo): string {
|
||||
if (typeof input === "string") return input;
|
||||
if (input instanceof URL) return input.toString();
|
||||
@@ -182,4 +196,64 @@ describe("web_fetch extraction fallbacks", () => {
|
||||
expect(details.extractor).toBe("firecrawl");
|
||||
expect(details.text).toContain("firecrawl fallback");
|
||||
});
|
||||
it("strips and truncates HTML from error responses", async () => {
  // 12,000 filler characters make the raw page larger than the 5,000-character bound asserted below.
  const filler = "x".repeat(12_000);
  const page = [
    "<!doctype html><html><head><title>Not Found</title></head><body><h1>Not Found</h1><p>",
    filler,
    "</p></body></html>",
  ].join("");
  // The content type is deliberately mixed-case.
  const fetchStub = vi.fn((input: RequestInfo) => {
    const response = errorHtmlResponse(page, 404, requestUrl(input), "Text/HTML; charset=utf-8");
    return Promise.resolve(response);
  });
  // @ts-expect-error mock fetch
  global.fetch = fetchStub;

  const fetchTool = createWebFetchTool({
    config: {
      tools: {
        web: {
          fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } },
        },
      },
    },
    sandboxed: false,
  });

  // Capture the rejection message so several properties of it can be checked.
  let failure = "";
  try {
    await fetchTool?.execute?.("call", { url: "https://example.com/missing" });
  } catch (error) {
    failure = (error as Error).message;
  }

  expect(failure).toContain("Web fetch failed (404):");
  expect(failure).toContain("Not Found");
  expect(failure).not.toContain("<html");
  expect(failure.length).toBeLessThan(5_000);
});
|
||||
|
||||
it("strips HTML errors when content-type is missing", async () => {
  const page =
    "<!DOCTYPE HTML><html><head><title>Oops</title></head><body><h1>Oops</h1></body></html>";
  // A null content type makes the helper build the response without a content-type header.
  const fetchStub = vi.fn((input: RequestInfo) => {
    const response = errorHtmlResponse(page, 500, requestUrl(input), null);
    return Promise.resolve(response);
  });
  // @ts-expect-error mock fetch
  global.fetch = fetchStub;

  const fetchTool = createWebFetchTool({
    config: {
      tools: {
        web: {
          fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } },
        },
      },
    },
    sandboxed: false,
  });

  const attempt = fetchTool?.execute?.("call", { url: "https://example.com/oops" });
  await expect(attempt).rejects.toThrow(/Web fetch failed \(500\):.*Oops/);
});
|
||||
});
|
||||
|
||||
@@ -95,7 +95,7 @@ describe("gateway server cron", () => {
|
||||
const jobId = typeof jobIdValue === "string" ? jobIdValue : "";
|
||||
expect(jobId.length > 0).toBe(true);
|
||||
|
||||
const runRes = await rpcReq(ws, "cron.run", { id: jobId, mode: "force" });
|
||||
const runRes = await rpcReq(ws, "cron.run", { id: jobId, mode: "force" }, 20_000);
|
||||
expect(runRes.ok).toBe(true);
|
||||
|
||||
const events = await waitForSystemEvent();
|
||||
@@ -279,7 +279,8 @@ describe("gateway server cron", () => {
|
||||
const jobId = typeof jobIdValue === "string" ? jobIdValue : "";
|
||||
expect(jobId.length > 0).toBe(true);
|
||||
|
||||
const runRes = await rpcReq(ws, "cron.run", { id: jobId, mode: "force" });
|
||||
// Full-suite runs can starve the event loop; give cron.run extra time to respond.
|
||||
const runRes = await rpcReq(ws, "cron.run", { id: jobId, mode: "force" }, 20_000);
|
||||
expect(runRes.ok).toBe(true);
|
||||
|
||||
const logPath = path.join(dir, "cron", "runs", `${jobId}.jsonl`);
|
||||
@@ -375,7 +376,7 @@ describe("gateway server cron", () => {
|
||||
expect(last.jobId).toBe(jobId);
|
||||
expect(last.summary).toBe("hello");
|
||||
|
||||
const runsRes = await rpcReq(ws, "cron.runs", { id: jobId, limit: 20 });
|
||||
const runsRes = await rpcReq(ws, "cron.runs", { id: jobId, limit: 20 }, 20_000);
|
||||
expect(runsRes.ok).toBe(true);
|
||||
const entries = (runsRes.payload as { entries?: unknown } | null)?.entries;
|
||||
expect(Array.isArray(entries)).toBe(true);
|
||||
|
||||
@@ -291,7 +291,12 @@ export async function connectOk(ws: WebSocket, opts?: Parameters<typeof connectR
|
||||
return res.payload as { type: "hello-ok" };
|
||||
}
|
||||
|
||||
export async function rpcReq<T = unknown>(ws: WebSocket, method: string, params?: unknown) {
|
||||
export async function rpcReq<T = unknown>(
|
||||
ws: WebSocket,
|
||||
method: string,
|
||||
params?: unknown,
|
||||
timeoutMs?: number,
|
||||
) {
|
||||
const { randomUUID } = await import("node:crypto");
|
||||
const id = randomUUID();
|
||||
ws.send(JSON.stringify({ type: "req", id, method, params }));
|
||||
@@ -301,11 +306,15 @@ export async function rpcReq<T = unknown>(ws: WebSocket, method: string, params?
|
||||
ok: boolean;
|
||||
payload?: T;
|
||||
error?: { message?: string; code?: string };
|
||||
}>(ws, (o) => {
|
||||
if (!o || typeof o !== "object" || Array.isArray(o)) return false;
|
||||
const rec = o as Record<string, unknown>;
|
||||
return rec.type === "res" && rec.id === id;
|
||||
});
|
||||
}>(
|
||||
ws,
|
||||
(o) => {
|
||||
if (!o || typeof o !== "object" || Array.isArray(o)) return false;
|
||||
const rec = o as Record<string, unknown>;
|
||||
return rec.type === "res" && rec.id === id;
|
||||
},
|
||||
timeoutMs,
|
||||
);
|
||||
}
|
||||
|
||||
export async function waitForSystemEvent(timeoutMs = 2000) {
|
||||
|
||||
Reference in New Issue
Block a user