fix: add readability fallback extraction

This commit is contained in:
Peter Steinberger
2026-01-24 02:14:59 +00:00
parent 0840029982
commit 1d862cf5c2

View File

@@ -81,6 +81,14 @@ export async function extractReadableContent(params: {
url: string; url: string;
extractMode: ExtractMode; extractMode: ExtractMode;
}): Promise<{ text: string; title?: string } | null> { }): Promise<{ text: string; title?: string } | null> {
// Best-effort extraction that bypasses Readability and works on the raw
// `params.html`. The surrounding function calls it when Readability yields no
// content, when text-mode output is empty, or when extraction throws.
//
// - Non-"text" modes: returns the htmlToMarkdown result unchanged.
// - "text" mode: returns the markdown flattened with markdownToText; if that is
//   empty/falsy, the text is the tag-stripped, whitespace-normalized HTML
//   instead. The title always comes from the markdown conversion.
const fallback = (): { text: string; title?: string } => {
  const markdown = htmlToMarkdown(params.html);
  if (params.extractMode !== "text") {
    return markdown;
  }
  const plain = markdownToText(markdown.text);
  if (plain) {
    return { text: plain, title: markdown.title };
  }
  // Markdown flattening produced nothing usable; strip tags from the raw HTML.
  const stripped = normalizeWhitespace(stripTags(params.html));
  return { text: stripped, title: markdown.title };
};
try { try {
const [{ Readability }, { parseHTML }] = await Promise.all([ const [{ Readability }, { parseHTML }] = await Promise.all([
import("@mozilla/readability"), import("@mozilla/readability"),
@@ -94,15 +102,15 @@ export async function extractReadableContent(params: {
} }
const reader = new Readability(document, { charThreshold: 0 }); const reader = new Readability(document, { charThreshold: 0 });
const parsed = reader.parse(); const parsed = reader.parse();
if (!parsed?.content) return null; if (!parsed?.content) return fallback();
const title = parsed.title || undefined; const title = parsed.title || undefined;
if (params.extractMode === "text") { if (params.extractMode === "text") {
const text = normalizeWhitespace(parsed.textContent ?? ""); const text = normalizeWhitespace(parsed.textContent ?? "");
return { text, title }; return text ? { text, title } : fallback();
} }
const rendered = htmlToMarkdown(parsed.content); const rendered = htmlToMarkdown(parsed.content);
return { text: rendered.text, title: title ?? rendered.title }; return { text: rendered.text, title: title ?? rendered.title };
} catch { } catch {
return null; return fallback();
} }
} }