fix: add readability fallback extraction
This commit is contained in:
@@ -81,6 +81,14 @@ export async function extractReadableContent(params: {
|
|||||||
url: string;
|
url: string;
|
||||||
extractMode: ExtractMode;
|
extractMode: ExtractMode;
|
||||||
}): Promise<{ text: string; title?: string } | null> {
|
}): Promise<{ text: string; title?: string } | null> {
|
||||||
|
const fallback = (): { text: string; title?: string } => {
|
||||||
|
const rendered = htmlToMarkdown(params.html);
|
||||||
|
if (params.extractMode === "text") {
|
||||||
|
const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html));
|
||||||
|
return { text, title: rendered.title };
|
||||||
|
}
|
||||||
|
return rendered;
|
||||||
|
};
|
||||||
try {
|
try {
|
||||||
const [{ Readability }, { parseHTML }] = await Promise.all([
|
const [{ Readability }, { parseHTML }] = await Promise.all([
|
||||||
import("@mozilla/readability"),
|
import("@mozilla/readability"),
|
||||||
@@ -94,15 +102,15 @@ export async function extractReadableContent(params: {
|
|||||||
}
|
}
|
||||||
const reader = new Readability(document, { charThreshold: 0 });
|
const reader = new Readability(document, { charThreshold: 0 });
|
||||||
const parsed = reader.parse();
|
const parsed = reader.parse();
|
||||||
if (!parsed?.content) return null;
|
if (!parsed?.content) return fallback();
|
||||||
const title = parsed.title || undefined;
|
const title = parsed.title || undefined;
|
||||||
if (params.extractMode === "text") {
|
if (params.extractMode === "text") {
|
||||||
const text = normalizeWhitespace(parsed.textContent ?? "");
|
const text = normalizeWhitespace(parsed.textContent ?? "");
|
||||||
return { text, title };
|
return text ? { text, title } : fallback();
|
||||||
}
|
}
|
||||||
const rendered = htmlToMarkdown(parsed.content);
|
const rendered = htmlToMarkdown(parsed.content);
|
||||||
return { text: rendered.text, title: title ?? rendered.title };
|
return { text: rendered.text, title: title ?? rendered.title };
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return fallback();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user