Web: trim HTML error bodies in web_fetch (#1193)

* Web: trim HTML error bodies in web_fetch

* fix: trim web_fetch HTML error bodies (#1193) (thanks @sebslight)

---------

Co-authored-by: Sebastian Slight <sbarrios93@gmail.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Seb Slight
2026-01-18 19:24:16 -05:00
committed by GitHub
parent 15311c138a
commit 2f6b5ffdfe
6 changed files with 127 additions and 11 deletions

View File

@@ -25,7 +25,7 @@ function normalizeWhitespace(value: string): string {
.trim();
}
function htmlToMarkdown(html: string): { text: string; title?: string } {
export function htmlToMarkdown(html: string): { text: string; title?: string } {
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined;
let text = html

View File

@@ -18,6 +18,7 @@ import {
} from "./web-shared.js";
import {
extractReadableContent,
htmlToMarkdown,
markdownToText,
truncateText,
type ExtractMode,
@@ -28,6 +29,7 @@ export { extractReadableContent } from "./web-fetch-utils.js";
// Names of the supported extraction outputs.
// NOTE(review): presumably mirrors the ExtractMode type imported above — confirm.
const EXTRACT_MODES = ["markdown", "text"] as const;
// NOTE(review): presumably the default character cap for fetched content — confirm at the use site.
const DEFAULT_FETCH_MAX_CHARS = 50_000;
// Character cap applied to an error-response body before it is embedded in
// the "Web fetch failed" error message.
const DEFAULT_ERROR_MAX_CHARS = 4_000;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
// 172_800_000 ms = 48 hours.
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
const DEFAULT_FETCH_USER_AGENT =
@@ -142,6 +144,30 @@ function resolveMaxChars(value: unknown, fallback: number): number {
return Math.max(100, Math.floor(parsed));
}
/**
 * Heuristically decides whether a body of text is an HTML document.
 *
 * Leading whitespace is skipped, then the first 256 characters are compared
 * case-insensitively against the two recognised document openers.
 *
 * @param value - Raw text to inspect.
 * @returns `true` when the text opens with `<!doctype html` or `<html`;
 *   `false` otherwise, including for empty or whitespace-only input.
 */
function looksLikeHtml(value: string): boolean {
  const body = value.trimStart();
  if (body.length === 0) return false;
  const prefix = body.slice(0, 256).toLowerCase();
  return ["<!doctype html", "<html"].some((marker) => prefix.startsWith(marker));
}
/**
 * Converts the raw body of a failed fetch into a short snippet suitable for
 * an error message.
 *
 * A body is treated as HTML when its content type contains `text/html`
 * (case-insensitive) or when `looksLikeHtml` recognises it. HTML bodies are
 * passed through `htmlToMarkdown` and then `markdownToText`, with the page
 * title (when present) placed on its own line ahead of the body text. The
 * result is trimmed and handed to `truncateText` with `maxChars`.
 *
 * @param params.detail - Raw response body.
 * @param params.contentType - Value of the `content-type` header, if any.
 * @param params.maxChars - Limit forwarded to `truncateText`.
 * @returns The `text` field of the `truncateText` result, or `""` when
 *   `detail` is empty.
 */
function formatWebFetchErrorDetail(params: {
  detail: string;
  contentType?: string | null;
  maxChars: number;
}): string {
  if (!params.detail) return "";

  const clip = (value: string): string => truncateText(value.trim(), params.maxChars).text;

  // Header check first; the body sniff only runs when the header is absent
  // or does not mention text/html.
  const declaredHtml = params.contentType?.toLowerCase().includes("text/html") ?? false;
  const isHtml = declaredHtml || looksLikeHtml(params.detail);
  if (!isHtml) {
    return clip(params.detail);
  }

  const { title, text: body } = htmlToMarkdown(params.detail);
  const combined = title ? `${title}\n${body}` : body;
  return clip(markdownToText(combined));
}
export async function fetchFirecrawlContent(params: {
url: string;
extractMode: ExtractMode;
@@ -329,7 +355,12 @@ async function runWebFetch(params: {
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
}
const detail = await readResponseText(res);
const rawDetail = await readResponseText(res);
const detail = formatWebFetchErrorDetail({
detail: rawDetail,
contentType: res.headers.get("content-type"),
maxChars: DEFAULT_ERROR_MAX_CHARS,
});
throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
}

View File

@@ -49,6 +49,20 @@ function firecrawlError(): MockResponse {
};
}
/**
 * Builds a mock non-OK response whose body is the given HTML.
 *
 * @param html - Body returned by the mock's `text()`.
 * @param status - HTTP status code to report (defaults to 404).
 * @param url - URL reported on the response.
 * @param contentType - `content-type` header value; pass `null` (or an empty
 *   string) to produce a response with no headers set.
 */
function errorHtmlResponse(
  html: string,
  status = 404,
  url = "https://example.com/",
  contentType: string | null = "text/html; charset=utf-8",
): MockResponse {
  const headerEntries: Record<string, string> = contentType
    ? { "content-type": contentType }
    : {};
  const response: MockResponse = {
    ok: false,
    status,
    url,
    headers: makeHeaders(headerEntries),
    text: () => Promise.resolve(html),
  };
  return response;
}
function requestUrl(input: RequestInfo): string {
if (typeof input === "string") return input;
if (input instanceof URL) return input.toString();
@@ -182,4 +196,64 @@ describe("web_fetch extraction fallbacks", () => {
expect(details.extractor).toBe("firecrawl");
expect(details.text).toContain("firecrawl fallback");
});
it("strips and truncates HTML from error responses", async () => {
  // The body is far larger than the 4_000-character error limit, so the
  // truncation path is exercised. The mixed-case content type checks that the
  // header match is case-insensitive.
  const filler = "x".repeat(12_000);
  const html = `<!doctype html><html><head><title>Not Found</title></head><body><h1>Not Found</h1><p>${filler}</p></body></html>`;
  const mockFetch = vi.fn(async (input: RequestInfo) =>
    errorHtmlResponse(html, 404, requestUrl(input), "Text/HTML; charset=utf-8"),
  );
  // @ts-expect-error mock fetch
  global.fetch = mockFetch;

  const tool = createWebFetchTool({
    config: {
      tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } },
    },
    sandboxed: false,
  });

  // Capture the rejection message; an empty string means nothing was thrown,
  // which the first expectation below then reports as a failure.
  const message = await (async (): Promise<string> => {
    try {
      await tool?.execute?.("call", { url: "https://example.com/missing" });
      return "";
    } catch (error) {
      return (error as Error).message;
    }
  })();

  expect(message).toContain("Web fetch failed (404):");
  expect(message).toContain("Not Found");
  expect(message).not.toContain("<html");
  expect(message.length).toBeLessThan(5_000);
});
it("strips HTML errors when content-type is missing", async () => {
  // No content-type header is sent, so HTML detection has to come from the
  // body itself (an upper-case doctype here).
  const html =
    "<!DOCTYPE HTML><html><head><title>Oops</title></head><body><h1>Oops</h1></body></html>";
  const mockFetch = vi.fn(async (input: RequestInfo) =>
    errorHtmlResponse(html, 500, requestUrl(input), null),
  );
  // @ts-expect-error mock fetch
  global.fetch = mockFetch;

  const tool = createWebFetchTool({
    config: {
      tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } },
    },
    sandboxed: false,
  });

  const pending = tool?.execute?.("call", { url: "https://example.com/oops" });
  await expect(pending).rejects.toThrow(/Web fetch failed \(500\):.*Oops/);
});
});