Files
clawdbot/src/markdown/ir.ts

797 lines
21 KiB
TypeScript

import MarkdownIt from "markdown-it";
import { chunkText } from "../auto-reply/chunk.js";
import type { MarkdownTableMode } from "../config/types.base.js";
/** One entry of the list stack: the kind of list being rendered and its running item counter. */
type ListState = {
type: "bullet" | "ordered";
// Number of the most recently emitted item; bumped by one before each item prefix is written.
index: number;
};
/** A link that has been opened but not yet closed. */
type LinkState = {
href: string;
// Length of the render target's text at the moment the link was opened.
labelStart: number;
};
/** Environment object handed to the markdown parser and shared with the renderer. */
type RenderEnv = {
// Stack of currently open lists, innermost last.
listStack: ListState[];
};
/**
 * Structural view of the token fields this module reads.
 * Synthetic spoiler tokens created in this file carry only `type`.
 */
type MarkdownToken = {
type: string;
content?: string;
children?: MarkdownToken[];
// Attribute pairs as [name, value]; used as a fallback when `attrGet` is absent.
attrs?: [string, string][];
attrGet?: (name: string) => string | null;
};
/** Inline/block styles that can be attached to a range of the rendered text. */
export type MarkdownStyle = "bold" | "italic" | "strikethrough" | "code" | "code_block" | "spoiler";
/** A styled range of the rendered text; `start`/`end` are offsets into `text`, with `end` exclusive. */
export type MarkdownStyleSpan = {
start: number;
end: number;
style: MarkdownStyle;
};
/** A hyperlinked range of the rendered text; `start`/`end` are offsets into `text`, with `end` exclusive. */
export type MarkdownLinkSpan = {
start: number;
end: number;
href: string;
};
/** Intermediate representation: plain text plus style and link spans that index into it. */
export type MarkdownIR = {
text: string;
styles: MarkdownStyleSpan[];
links: MarkdownLinkSpan[];
};
/** A style that has been opened at `start` and is waiting for its closing token. */
type OpenStyle = {
style: MarkdownStyle;
start: number;
};
/** Accumulator that inline content is written into (the document itself or a table cell). */
type RenderTarget = {
text: string;
// Completed style spans.
styles: MarkdownStyleSpan[];
// Styles opened but not yet closed.
openStyles: OpenStyle[];
// Completed link spans.
links: MarkdownLinkSpan[];
// Links opened but not yet closed.
linkStack: LinkState[];
};
/** A finished table cell: its text and the spans relative to that text. */
type TableCell = {
text: string;
styles: MarkdownStyleSpan[];
links: MarkdownLinkSpan[];
};
/** Cells collected while a table is being parsed, before it is rendered on table close. */
type TableState = {
headers: TableCell[];
rows: TableCell[][];
// Cells of the row currently being read.
currentRow: TableCell[];
// Target for inline content while inside a cell; null between cells.
currentCell: RenderTarget | null;
// True between thead_open and thead_close.
inHeader: boolean;
};
/** Full renderer state: the document-level render target plus rendering options and table bookkeeping. */
type RenderState = RenderTarget & {
env: RenderEnv;
headingStyle: "none" | "bold";
blockquotePrefix: string;
enableSpoilers: boolean;
tableMode: MarkdownTableMode;
// Non-null only while a table is being collected.
table: TableState | null;
// Set once a table has been opened with tableMode other than "off".
hasTables: boolean;
};
/** Options accepted by markdownToIR / markdownToIRWithMeta. */
export type MarkdownParseOptions = {
/** Passed to the parser's `linkify` option. Default: true. */
linkify?: boolean;
/** When true, `||` pairs in text are turned into spoiler spans. Default: false. */
enableSpoilers?: boolean;
/** "bold" wraps heading text in a bold span; "none" leaves it unstyled. Default: "none". */
headingStyle?: "none" | "bold";
/** Text written when a blockquote opens. Default: empty string. */
blockquotePrefix?: string;
/** Set to false to disable the parser's "autolink" rule; any other value leaves it enabled. */
autolink?: boolean;
/** How to render tables (off|bullets|code). Default: off. */
tableMode?: MarkdownTableMode;
};
/**
 * Creates the markdown-it parser used by this module.
 *
 * Raw HTML, `breaks` and the typographer are always off. `linkify` follows the
 * option (on unless explicitly disabled), strikethrough is always enabled,
 * tables are enabled only for a table mode other than "off", and the
 * "autolink" rule is disabled only when `autolink` is exactly `false`.
 *
 * @param options Parse options supplied by the caller.
 * @returns The configured parser instance.
 */
function createMarkdownIt(options: MarkdownParseOptions): MarkdownIt {
  const parser = new MarkdownIt({
    html: false,
    linkify: options.linkify ?? true,
    breaks: false,
    typographer: false,
  });

  parser.enable("strikethrough");

  // A missing table mode behaves like "off".
  const tablesRequested = !!options.tableMode && options.tableMode !== "off";
  if (tablesRequested) {
    parser.enable("table");
  } else {
    parser.disable("table");
  }

  if (options.autolink === false) {
    parser.disable("autolink");
  }

  return parser;
}
/**
 * Reads an attribute value from a token.
 *
 * Uses the token's own `attrGet` accessor when it has one; otherwise scans the
 * `attrs` pairs and returns the value of the first pair whose key matches.
 *
 * @param token Token to inspect.
 * @param name Attribute name to look up.
 * @returns The attribute value, or `null` when it is not present.
 */
function getAttr(token: MarkdownToken, name: string): string | null {
  if (token.attrGet) {
    return token.attrGet(name);
  }
  const pair = token.attrs?.find(([key]) => key === name);
  return pair ? pair[1] : null;
}
/**
 * Produces a plain text token derived from `base`.
 *
 * The result is a shallow copy of `base` with `type` forced to "text",
 * `content` replaced, and `children` cleared; `base` itself is not modified.
 *
 * @param base Token whose remaining fields are carried over.
 * @param content Text content of the new token.
 * @returns The new text token.
 */
function createTextToken(base: MarkdownToken, content: string): MarkdownToken {
  const textToken: MarkdownToken = { ...base };
  textToken.type = "text";
  textToken.content = content;
  textToken.children = undefined;
  return textToken;
}
/**
 * Rewrites, in place, the children of every token that has a non-empty
 * `children` list so that `||` markers become spoiler open/close tokens.
 *
 * @param tokens Token list; entries are mutated.
 */
function applySpoilerTokens(tokens: MarkdownToken[]): void {
  tokens.forEach((token) => {
    const inlineChildren = token.children;
    if (!inlineChildren || !(inlineChildren.length > 0)) return;
    token.children = injectSpoilersIntoInline(inlineChildren);
  });
}
/**
 * Splits text tokens on `||` and inserts spoiler marker tokens between the pieces.
 *
 * Each `||` toggles the spoiler state: the first emits `spoiler_open`, the next
 * `spoiler_close`, and so on. The toggle is shared by all tokens of the list,
 * so a spoiler may open in one text token and close in a later one. Non-text
 * tokens and text tokens without `||` are passed through untouched. Empty
 * pieces around a marker produce no text token.
 *
 * @param tokens Inline tokens to scan.
 * @returns A new token list with the markers replaced.
 */
function injectSpoilersIntoInline(tokens: MarkdownToken[]): MarkdownToken[] {
  const output: MarkdownToken[] = [];
  let insideSpoiler = false;

  for (const token of tokens) {
    const raw = token.content ?? "";
    if (token.type !== "text" || !raw.includes("||")) {
      output.push(token);
      continue;
    }

    // Splitting finds the same left-to-right, non-overlapping markers as a manual scan.
    const segments = raw.split("||");
    const lastPosition = segments.length - 1;
    segments.forEach((segment, position) => {
      if (segment) {
        output.push(createTextToken(token, segment));
      }
      if (position === lastPosition) return;
      // A marker sits between this segment and the next one.
      insideSpoiler = !insideSpoiler;
      output.push({ type: insideSpoiler ? "spoiler_open" : "spoiler_close" });
    });
  }

  return output;
}
/**
 * Creates an empty render target: no text, no spans, nothing open.
 *
 * @returns A fresh target with its own arrays.
 */
function initRenderTarget(): RenderTarget {
  const blank: RenderTarget = { text: "", styles: [], openStyles: [], links: [], linkStack: [] };
  return blank;
}
/**
 * Picks where inline content should currently be written.
 *
 * @param state Renderer state.
 * @returns The open table cell when there is one, otherwise the state itself.
 */
function resolveRenderTarget(state: RenderState): RenderTarget {
  const table = state.table;
  if (table == null) return state;
  const cell = table.currentCell;
  return cell == null ? state : cell;
}
/**
 * Appends text to the current render target; an empty value is ignored.
 *
 * @param state Renderer state.
 * @param value Text to append.
 */
function appendText(state: RenderState, value: string) {
  if (value) {
    resolveRenderTarget(state).text += value;
  }
}
/**
 * Records that `style` starts at the current end of the render target's text.
 *
 * @param state Renderer state.
 * @param style Style being opened.
 */
function openStyle(state: RenderState, style: MarkdownStyle) {
  const { openStyles, text } = resolveRenderTarget(state);
  openStyles.push({ style, start: text.length });
}
/**
 * Closes the most recently opened entry of `style` on the current render target.
 *
 * The entry is removed from the open list and, if any text was written since it
 * was opened, a span from its start to the current end of text is recorded.
 * Nothing happens when no such style is open.
 *
 * @param state Renderer state.
 * @param style Style being closed.
 */
function closeStyle(state: RenderState, style: MarkdownStyle) {
  const target = resolveRenderTarget(state);
  const pending = target.openStyles;

  // Walk back to the innermost open entry of this style.
  let position = pending.length - 1;
  while (position >= 0 && pending[position]?.style !== style) {
    position -= 1;
  }
  if (position < 0) return;

  const start = pending[position].start;
  pending.splice(position, 1);
  const end = target.text.length;
  if (end > start) {
    target.styles.push({ start, end, style });
  }
}
/**
 * Writes a blank-line separator to the document text, except while inside a
 * list or while a table is being collected.
 *
 * @param state Renderer state.
 */
function appendParagraphSeparator(state: RenderState) {
  const insideList = state.env.listStack.length > 0;
  const insideTable = Boolean(state.table);
  if (insideList || insideTable) return;
  state.text += "\n\n";
}
/**
 * Writes the marker for the next item of the innermost open list.
 *
 * The list's counter is incremented first; ordered lists then emit
 * "<n>. " and bullet lists "• ", preceded by one indent unit per enclosing
 * list. Does nothing when no list is open.
 *
 * @param state Renderer state.
 */
function appendListPrefix(state: RenderState) {
  const { listStack } = state.env;
  const depth = listStack.length;
  const current = listStack[depth - 1];
  if (!current) return;

  current.index += 1;
  const marker = current.type === "ordered" ? `${current.index}. ` : "• ";
  const indent = " ".repeat(Math.max(0, depth - 1));
  state.text += indent + marker;
}
/**
 * Appends inline code to the current render target and marks it with a "code" span.
 * Empty content is ignored.
 *
 * @param state Renderer state.
 * @param content Code text.
 */
function renderInlineCode(state: RenderState, content: string) {
  if (!content) return;
  const target = resolveRenderTarget(state);
  const from = target.text.length;
  const to = from + content.length;
  target.styles.push({ start: from, end: to, style: "code" });
  target.text += content;
}
/**
 * Appends a code block to the current render target.
 *
 * The code always ends with a newline (one is added if missing) and that
 * newline is part of the "code_block" span. Outside of lists an extra newline
 * is written after the block; it is not part of the span.
 *
 * @param state Renderer state.
 * @param content Code block content.
 */
function renderCodeBlock(state: RenderState, content: string) {
  const raw = content ?? "";
  const code = raw.endsWith("\n") ? raw : `${raw}\n`;
  const target = resolveRenderTarget(state);
  const start = target.text.length;
  const insideList = state.env.listStack.length > 0;

  target.text += insideList ? code : `${code}\n`;
  target.styles.push({ start, end: start + code.length, style: "code_block" });
}
/**
 * Closes the innermost open link on the current render target and records its span.
 *
 * Links whose href is missing or blank after trimming are dropped. The span
 * runs from where the label started to the current end of text and is recorded
 * even when the label produced no text.
 *
 * @param state Renderer state.
 */
function handleLinkClose(state: RenderState) {
  const target = resolveRenderTarget(state);
  const pending = target.linkStack.pop();
  const href = pending?.href ? pending.href.trim() : "";
  if (!pending || !href) return;

  target.links.push({ start: pending.labelStart, end: target.text.length, href });
}
/**
 * Creates the bookkeeping for a newly opened table: no headers, no rows,
 * no cell in progress, and not inside the header section.
 *
 * @returns A fresh table state with its own arrays.
 */
function initTableState(): TableState {
  const fresh: TableState = { headers: [], rows: [], currentRow: [], currentCell: null, inHeader: false };
  return fresh;
}
/**
 * Finalizes a cell's render target: any styles still open are closed at the
 * end of the cell text, then the text and span lists are returned as a cell.
 *
 * @param cell Render target that collected the cell's content.
 * @returns The cell, sharing the target's span arrays.
 */
function finishTableCell(cell: RenderTarget): TableCell {
  closeRemainingStyles(cell);
  const { text, styles, links } = cell;
  return { text, styles, links };
}
/**
 * Strips leading and trailing whitespace from a cell and re-bases its spans.
 *
 * Spans are shifted by the amount trimmed from the front and clipped to the
 * trimmed text; spans left empty are dropped. When there is nothing to trim,
 * the original cell object is returned as is.
 *
 * @param cell Cell to trim.
 * @returns The same cell if already trimmed, otherwise a new trimmed cell.
 */
function trimCell(cell: TableCell): TableCell {
  const whitespace = /\s/;
  const source = cell.text;

  let lead = 0;
  let tail = source.length;
  while (lead < tail && whitespace.test(source[lead] ?? "")) lead += 1;
  while (tail > lead && whitespace.test(source[tail - 1] ?? "")) tail -= 1;
  if (lead === 0 && tail === source.length) return cell;

  const text = source.slice(lead, tail);

  // Maps a span onto the trimmed text; null means nothing of it survives.
  const rebase = (from: number, to: number): [number, number] | null => {
    const start = Math.max(0, from - lead);
    const end = Math.min(text.length, to - lead);
    return end > start ? [start, end] : null;
  };

  const styles: MarkdownStyleSpan[] = [];
  cell.styles.forEach((span) => {
    const range = rebase(span.start, span.end);
    if (range) styles.push({ start: range[0], end: range[1], style: span.style });
  });

  const links: MarkdownLinkSpan[] = [];
  cell.links.forEach((span) => {
    const range = rebase(span.start, span.end);
    if (range) links.push({ start: range[0], end: range[1], href: span.href });
  });

  return { text, styles, links };
}
/**
 * Appends a cell's text to the document text and copies its spans across,
 * shifted by the position at which the text was inserted. Cells with empty
 * text contribute nothing.
 *
 * @param state Renderer state; writes go to the document, not to an open cell.
 * @param cell Cell to append.
 */
function appendCell(state: RenderState, cell: TableCell) {
  if (!cell.text) return;

  const offset = state.text.length;
  state.text += cell.text;

  cell.styles.forEach(({ start, end, style }) => {
    state.styles.push({ start: offset + start, end: offset + end, style });
  });
  cell.links.forEach(({ start, end, href }) => {
    state.links.push({ start: offset + start, end: offset + end, href });
  });
}
/**
 * Renders the collected table as bullet lists appended to the document text.
 *
 * With more than one header and at least one row, each row becomes a section:
 * the first cell is written in bold on its own line, and every further
 * non-empty cell becomes "• <header>: <value>" (falling back to
 * "Column <i>: " when the header is empty). Otherwise every non-empty cell of
 * every row becomes a bullet, prefixed with its header when there is one.
 * Each rendered row is followed by a blank line.
 *
 * @param state Renderer state holding the table to render.
 */
function renderTableAsBullets(state: RenderState) {
  const table = state.table;
  if (!table) return;

  const headers = table.headers.map((cell) => trimCell(cell));
  const rows = table.rows.map((row) => row.map((cell) => trimCell(cell)));
  if (headers.length === 0 && rows.length === 0) return;

  type Cell = (typeof headers)[number];

  // Writes one "• header: value" line; cells without text are skipped entirely.
  const writeBullet = (header: Cell | undefined, value: Cell | undefined, fallbackLabel: string) => {
    if (!value?.text) return;
    state.text += "• ";
    if (header?.text) {
      appendCell(state, header);
      state.text += ": ";
    } else {
      state.text += fallbackLabel;
    }
    appendCell(state, value);
    state.text += "\n";
  };

  // First column acts as the row label when the table has several headed columns.
  const firstColumnIsLabel = headers.length > 1 && rows.length > 0;

  for (const row of rows) {
    if (firstColumnIsLabel) {
      if (row.length === 0) continue;
      const label = row[0];
      if (label?.text) {
        const labelStart = state.text.length;
        appendCell(state, label);
        const labelEnd = state.text.length;
        if (labelEnd > labelStart) {
          state.styles.push({ start: labelStart, end: labelEnd, style: "bold" });
        }
        state.text += "\n";
      }
    }

    const firstValueColumn = firstColumnIsLabel ? 1 : 0;
    for (let column = firstValueColumn; column < row.length; column++) {
      const fallbackLabel = firstColumnIsLabel ? `Column ${column}: ` : "";
      writeBullet(headers[column], row[column], fallbackLabel);
    }
    state.text += "\n";
  }
}
/**
 * Renders the collected table as a pipe-delimited, column-aligned grid and
 * marks the whole grid as a "code_block" span.
 *
 * Column widths are the longest trimmed cell text per column; the divider row
 * uses at least three dashes per column. Cell spans are carried over via
 * appendCell. Outside of lists a newline is written after the grid.
 *
 * @param state Renderer state holding the table to render.
 */
function renderTableAsCode(state: RenderState) {
  const table = state.table;
  if (!table) return;

  const headers = table.headers.map((cell) => trimCell(cell));
  const rows = table.rows.map((row) => row.map((cell) => trimCell(cell)));

  const columnCount = rows.reduce((widest, row) => Math.max(widest, row.length), headers.length);
  if (columnCount === 0) return;

  // Widest cell text per column across the header and every body row.
  const widths: number[] = new Array(columnCount).fill(0);
  for (const cells of [headers, ...rows]) {
    for (let column = 0; column < columnCount; column += 1) {
      widths[column] = Math.max(widths[column], cells[column]?.text.length ?? 0);
    }
  }

  const writeRow = (cells: typeof headers) => {
    state.text += "|";
    widths.forEach((width, column) => {
      state.text += " ";
      const cell = cells[column];
      if (cell) appendCell(state, cell);
      const padding = width - (cell?.text.length ?? 0);
      if (padding > 0) state.text += " ".repeat(padding);
      state.text += " |";
    });
    state.text += "\n";
  };

  const codeStart = state.text.length;

  writeRow(headers);
  const divider = widths.map((width) => ` ${"-".repeat(Math.max(3, width))} |`).join("");
  state.text += `|${divider}\n`;
  rows.forEach((row) => writeRow(row));

  const codeEnd = state.text.length;
  if (codeEnd > codeStart) {
    state.styles.push({ start: codeStart, end: codeEnd, style: "code_block" });
  }
  if (state.env.listStack.length === 0) {
    state.text += "\n";
  }
}
/**
 * Walks a token list and renders it into `state`.
 *
 * Inline content (text, styles, links, inline code) is written to the current
 * render target, which is the open table cell when there is one and the
 * document otherwise. Block-level separators, list prefixes and blockquote
 * prefixes are written straight to the document text. Table tokens only
 * collect cells; the table is rendered when it closes, according to
 * `state.tableMode`. Token types without a dedicated case are ignored except
 * that their children, if any, are rendered recursively.
 *
 * @param tokens Tokens to render, in document order.
 * @param state Renderer state; mutated in place.
 */
function renderTokens(tokens: MarkdownToken[], state: RenderState): void {
for (const token of tokens) {
switch (token.type) {
case "inline":
if (token.children) renderTokens(token.children, state);
break;
case "text":
appendText(state, token.content ?? "");
break;
case "em_open":
openStyle(state, "italic");
break;
case "em_close":
closeStyle(state, "italic");
break;
case "strong_open":
openStyle(state, "bold");
break;
case "strong_close":
closeStyle(state, "bold");
break;
case "s_open":
openStyle(state, "strikethrough");
break;
case "s_close":
closeStyle(state, "strikethrough");
break;
case "code_inline":
renderInlineCode(state, token.content ?? "");
break;
// Spoiler markers are only honored when spoilers are enabled in the state.
case "spoiler_open":
if (state.enableSpoilers) openStyle(state, "spoiler");
break;
case "spoiler_close":
if (state.enableSpoilers) closeStyle(state, "spoiler");
break;
case "link_open": {
// Remember where the label starts; the span is recorded when the link closes.
const href = getAttr(token, "href") ?? "";
const target = resolveRenderTarget(state);
target.linkStack.push({ href, labelStart: target.text.length });
break;
}
case "link_close":
handleLinkClose(state);
break;
// Images are reduced to the token's content text.
case "image":
appendText(state, token.content ?? "");
break;
// Soft and hard breaks both become a single newline.
case "softbreak":
case "hardbreak":
appendText(state, "\n");
break;
case "paragraph_close":
appendParagraphSeparator(state);
break;
case "heading_open":
if (state.headingStyle === "bold") openStyle(state, "bold");
break;
case "heading_close":
if (state.headingStyle === "bold") closeStyle(state, "bold");
appendParagraphSeparator(state);
break;
// The blockquote prefix is written once, when the quote opens.
case "blockquote_open":
if (state.blockquotePrefix) state.text += state.blockquotePrefix;
break;
case "blockquote_close":
state.text += "\n";
break;
case "bullet_list_open":
state.env.listStack.push({ type: "bullet", index: 0 });
break;
case "bullet_list_close":
state.env.listStack.pop();
break;
case "ordered_list_open": {
// The counter is incremented before each item is written, so start one below.
const start = Number(getAttr(token, "start") ?? "1");
state.env.listStack.push({ type: "ordered", index: start - 1 });
break;
}
case "ordered_list_close":
state.env.listStack.pop();
break;
case "list_item_open":
appendListPrefix(state);
break;
case "list_item_close":
state.text += "\n";
break;
case "code_block":
case "fence":
renderCodeBlock(state, token.content ?? "");
break;
// HTML tokens are emitted as their literal content.
case "html_block":
case "html_inline":
appendText(state, token.content ?? "");
break;
// Table handling
case "table_open":
if (state.tableMode !== "off") {
state.table = initTableState();
state.hasTables = true;
}
break;
case "table_close":
// Render the collected cells in the configured mode, then leave table mode.
if (state.table) {
if (state.tableMode === "bullets") {
renderTableAsBullets(state);
} else if (state.tableMode === "code") {
renderTableAsCode(state);
}
}
state.table = null;
break;
case "thead_open":
if (state.table) {
state.table.inHeader = true;
}
break;
case "thead_close":
if (state.table) {
state.table.inHeader = false;
}
break;
case "tbody_open":
case "tbody_close":
break;
case "tr_open":
if (state.table) {
state.table.currentRow = [];
}
break;
case "tr_close":
// A row closed inside the header section replaces the headers; any other row is appended.
if (state.table) {
if (state.table.inHeader) {
state.table.headers = state.table.currentRow;
} else {
state.table.rows.push(state.table.currentRow);
}
state.table.currentRow = [];
}
break;
case "th_open":
case "td_open":
// From here until the cell closes, inline content goes into this fresh target.
if (state.table) {
state.table.currentCell = initRenderTarget();
}
break;
case "th_close":
case "td_close":
if (state.table?.currentCell) {
state.table.currentRow.push(finishTableCell(state.table.currentCell));
state.table.currentCell = null;
}
break;
// A horizontal rule contributes only a newline.
case "hr":
state.text += "\n";
break;
default:
if (token.children) renderTokens(token.children, state);
break;
}
}
}
/**
 * Closes every style still open on `target` at the current end of its text.
 *
 * Entries are processed innermost first; an entry only yields a span when text
 * was written after it opened. The open list is emptied afterwards.
 *
 * @param target Render target to finalize.
 */
function closeRemainingStyles(target: RenderTarget) {
  const end = target.text.length;
  const innermostFirst = [...target.openStyles].reverse();
  for (const { start, style } of innermostFirst) {
    if (end > start) {
      target.styles.push({ start, end, style });
    }
  }
  target.openStyles = [];
}
/**
 * Clips style spans to the range [0, maxLength] and discards spans that end up empty.
 *
 * @param spans Spans to clip; not modified.
 * @param maxLength Upper bound for span offsets.
 * @returns New span objects, in the original order.
 */
function clampStyleSpans(spans: MarkdownStyleSpan[], maxLength: number): MarkdownStyleSpan[] {
  return spans
    .map((span) => {
      const start = Math.max(0, Math.min(span.start, maxLength));
      const end = Math.max(start, Math.min(span.end, maxLength));
      return { start, end, style: span.style };
    })
    .filter((span) => span.end > span.start);
}
/**
 * Clips link spans to the range [0, maxLength] and discards spans that end up empty.
 *
 * @param spans Spans to clip; not modified.
 * @param maxLength Upper bound for span offsets.
 * @returns New span objects, in the original order.
 */
function clampLinkSpans(spans: MarkdownLinkSpan[], maxLength: number): MarkdownLinkSpan[] {
  return spans.reduce<MarkdownLinkSpan[]>((kept, { start: rawStart, end: rawEnd, href }) => {
    const start = Math.max(0, Math.min(rawStart, maxLength));
    const end = Math.max(start, Math.min(rawEnd, maxLength));
    if (end > start) {
      kept.push({ start, end, href });
    }
    return kept;
  }, []);
}
/**
 * Merges overlapping or touching spans of the same style.
 *
 * Spans are ordered by start, then end, then style name. Each span is compared
 * against the last merged span of the SAME style, so two bold spans still merge
 * when a span of another style sorts between them (for example bold 0-10,
 * italic 1-3, bold 2-5 yields bold 0-10 and italic 1-3). Comparing only with
 * the immediately preceding span would leave such nested duplicates behind.
 *
 * @param spans Spans to merge; neither the array nor its elements are modified.
 * @returns New span objects, sorted by start, then end, then style name.
 */
function mergeStyleSpans(spans: MarkdownStyleSpan[]): MarkdownStyleSpan[] {
  const byPosition = (a: MarkdownStyleSpan, b: MarkdownStyleSpan): number => {
    if (a.start !== b.start) return a.start - b.start;
    if (a.end !== b.end) return a.end - b.end;
    return a.style.localeCompare(b.style);
  };

  const sorted = [...spans].sort(byPosition);
  const merged: MarkdownStyleSpan[] = [];
  // Most recent merged span per style; only that one can still absorb later spans,
  // because input is processed in ascending start order.
  const lastOfStyle = new Map<string, MarkdownStyleSpan>();

  for (const span of sorted) {
    const previous = lastOfStyle.get(span.style);
    if (previous && span.start <= previous.end) {
      previous.end = Math.max(previous.end, span.end);
      continue;
    }
    const copy = { ...span };
    merged.push(copy);
    lastOfStyle.set(span.style, copy);
  }

  // Extending an earlier span can change its rank among spans with equal starts.
  return merged.sort(byPosition);
}
/**
 * Extracts the style spans that intersect the window [start, end), re-based so
 * that offsets are relative to `start`, and merges the result.
 *
 * @param spans Spans over the full text; not modified.
 * @param start Window start offset.
 * @param end Window end offset (exclusive).
 * @returns Merged spans relative to the window.
 */
function sliceStyleSpans(
  spans: MarkdownStyleSpan[],
  start: number,
  end: number,
): MarkdownStyleSpan[] {
  if (spans.length === 0) return [];

  const inWindow = spans.reduce<MarkdownStyleSpan[]>((kept, span) => {
    const from = Math.max(span.start, start);
    const to = Math.min(span.end, end);
    if (to > from) {
      kept.push({ start: from - start, end: to - start, style: span.style });
    }
    return kept;
  }, []);

  return mergeStyleSpans(inWindow);
}
/**
 * Extracts the link spans that intersect the window [start, end), re-based so
 * that offsets are relative to `start`.
 *
 * @param spans Spans over the full text; not modified.
 * @param start Window start offset.
 * @param end Window end offset (exclusive).
 * @returns New span objects relative to the window, in the original order.
 */
function sliceLinkSpans(spans: MarkdownLinkSpan[], start: number, end: number): MarkdownLinkSpan[] {
  if (spans.length === 0) return [];

  const inWindow: MarkdownLinkSpan[] = [];
  spans.forEach(({ start: spanStart, end: spanEnd, href }) => {
    const from = Math.max(spanStart, start);
    const to = Math.min(spanEnd, end);
    if (to > from) {
      inWindow.push({ start: from - start, end: to - start, href });
    }
  });
  return inWindow;
}
/**
 * Converts markdown into the intermediate representation.
 *
 * @param markdown Markdown source.
 * @param options Parse/render options.
 * @returns The rendered text with its style and link spans.
 */
export function markdownToIR(markdown: string, options: MarkdownParseOptions = {}): MarkdownIR {
  const { ir } = markdownToIRWithMeta(markdown, options);
  return ir;
}
/**
 * Converts markdown into the intermediate representation and reports whether
 * a table was rendered.
 *
 * The source is parsed, spoiler markers are injected when enabled, and the
 * tokens are rendered. Trailing whitespace is then cut from the text, except
 * that the text is never cut shorter than the end of the last code block span.
 * Spans are clipped to the final text length and style spans are merged.
 *
 * @param markdown Markdown source; a nullish value is treated as empty.
 * @param options Parse/render options.
 * @returns The IR plus `hasTables`, which is true when a table was opened with
 *   a table mode other than "off".
 */
export function markdownToIRWithMeta(
  markdown: string,
  options: MarkdownParseOptions = {},
): { ir: MarkdownIR; hasTables: boolean } {
  const env: RenderEnv = { listStack: [] };
  const parser = createMarkdownIt(options);
  const tokens = parser.parse(markdown ?? "", env as unknown as object) as MarkdownToken[];
  if (options.enableSpoilers) {
    applySpoilerTokens(tokens);
  }

  const state: RenderState = {
    text: "",
    styles: [],
    openStyles: [],
    links: [],
    linkStack: [],
    env,
    headingStyle: options.headingStyle ?? "none",
    blockquotePrefix: options.blockquotePrefix ?? "",
    enableSpoilers: options.enableSpoilers ?? false,
    tableMode: options.tableMode ?? "off",
    table: null,
    hasTables: false,
  };
  renderTokens(tokens, state);
  closeRemainingStyles(state);

  // Keep the trailing newline that belongs to a code block even though it is whitespace.
  const lastCodeBlockEnd = state.styles.reduce(
    (furthest, span) =>
      span.style === "code_block" && span.end > furthest ? span.end : furthest,
    0,
  );
  const finalLength = Math.max(state.text.trimEnd().length, lastCodeBlockEnd);

  const ir: MarkdownIR = {
    text: state.text.slice(0, finalLength),
    styles: mergeStyleSpans(clampStyleSpans(state.styles, finalLength)),
    links: clampLinkSpans(state.links, finalLength),
  };
  return { ir, hasTables: state.hasTables };
}
/**
 * Splits an IR into pieces of at most `limit` characters, carrying over the
 * spans that fall inside each piece.
 *
 * The text is split by `chunkText`. Each piece is located in the original text
 * by advancing a cursor: before every piece but the first, whitespace at the
 * cursor is skipped, and the piece is then taken to span its own length from
 * there. Spans are sliced to that window and re-based.
 *
 * @param ir IR to split.
 * @param limit Maximum piece length; a non-positive limit disables splitting.
 * @returns No pieces for empty text, the IR itself when it already fits,
 *   otherwise one IR per non-empty piece.
 */
export function chunkMarkdownIR(ir: MarkdownIR, limit: number): MarkdownIR[] {
  const fullText = ir.text;
  if (!fullText) return [];
  if (limit <= 0 || fullText.length <= limit) return [ir];

  const whitespace = /\s/;
  const pieces = chunkText(fullText, limit);
  const output: MarkdownIR[] = [];
  let cursor = 0;

  for (let position = 0; position < pieces.length; position += 1) {
    const piece = pieces[position];
    if (!piece) continue;

    if (position > 0) {
      // Skip the whitespace at the cursor before locating this piece.
      while (cursor < fullText.length && whitespace.test(fullText[cursor] ?? "")) {
        cursor += 1;
      }
    }

    const windowStart = cursor;
    const windowEnd = Math.min(fullText.length, windowStart + piece.length);
    output.push({
      text: piece,
      styles: sliceStyleSpans(ir.styles, windowStart, windowEnd),
      links: sliceLinkSpans(ir.links, windowStart, windowEnd),
    });
    cursor = windowEnd;
  }

  return output;
}