fix: preserve markdown fences when chunking

This commit is contained in:
Peter Steinberger
2026-01-06 20:19:56 +01:00
parent afc42c7547
commit 67bda21811
13 changed files with 378 additions and 108 deletions

View File

@@ -1,17 +1,15 @@
import {
findFenceSpanAt,
isSafeFenceBreak,
parseFenceSpans,
} from "../markdown/fences.js";
export type BlockReplyChunking = {
minChars: number;
maxChars: number;
breakPreference?: "paragraph" | "newline" | "sentence";
};
type FenceSpan = {
start: number;
end: number;
openLine: string;
marker: string;
indent: string;
};
type FenceSplit = {
closeFenceLine: string;
reopenFenceLine: string;
@@ -123,7 +121,10 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph") {
let paragraphIdx = buffer.indexOf("\n\n");
while (paragraphIdx !== -1) {
if (paragraphIdx >= minChars && isSafeBreak(fenceSpans, paragraphIdx)) {
if (
paragraphIdx >= minChars &&
isSafeFenceBreak(fenceSpans, paragraphIdx)
) {
return { index: paragraphIdx };
}
paragraphIdx = buffer.indexOf("\n\n", paragraphIdx + 2);
@@ -133,7 +134,10 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = buffer.indexOf("\n");
while (newlineIdx !== -1) {
if (newlineIdx >= minChars && isSafeBreak(fenceSpans, newlineIdx)) {
if (
newlineIdx >= minChars &&
isSafeFenceBreak(fenceSpans, newlineIdx)
) {
return { index: newlineIdx };
}
newlineIdx = buffer.indexOf("\n", newlineIdx + 1);
@@ -147,7 +151,7 @@ export class EmbeddedBlockChunker {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
@@ -168,7 +172,7 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph") {
let paragraphIdx = window.lastIndexOf("\n\n");
while (paragraphIdx >= minChars) {
if (isSafeBreak(fenceSpans, paragraphIdx)) {
if (isSafeFenceBreak(fenceSpans, paragraphIdx)) {
return { index: paragraphIdx };
}
paragraphIdx = window.lastIndexOf("\n\n", paragraphIdx - 1);
@@ -178,7 +182,7 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = window.lastIndexOf("\n");
while (newlineIdx >= minChars) {
if (isSafeBreak(fenceSpans, newlineIdx)) {
if (isSafeFenceBreak(fenceSpans, newlineIdx)) {
return { index: newlineIdx };
}
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
@@ -192,7 +196,7 @@ export class EmbeddedBlockChunker {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
@@ -200,13 +204,13 @@ export class EmbeddedBlockChunker {
}
for (let i = window.length - 1; i >= minChars; i--) {
if (/\s/.test(window[i]) && isSafeBreak(fenceSpans, i)) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
return { index: i };
}
}
if (buffer.length >= maxChars) {
if (isSafeBreak(fenceSpans, maxChars)) return { index: maxChars };
if (isSafeFenceBreak(fenceSpans, maxChars)) return { index: maxChars };
const fence = findFenceSpanAt(fenceSpans, maxChars);
if (fence) {
return {
@@ -229,76 +233,3 @@ function stripLeadingNewlines(value: string): string {
while (i < value.length && value[i] === "\n") i++;
return i > 0 ? value.slice(i) : value;
}
function parseFenceSpans(buffer: string): FenceSpan[] {
const spans: FenceSpan[] = [];
let open:
| {
start: number;
markerChar: string;
markerLen: number;
openLine: string;
marker: string;
indent: string;
}
| undefined;
let offset = 0;
while (offset <= buffer.length) {
const nextNewline = buffer.indexOf("\n", offset);
const lineEnd = nextNewline === -1 ? buffer.length : nextNewline;
const line = buffer.slice(offset, lineEnd);
const match = line.match(/^( {0,3})(`{3,}|~{3,})(.*)$/);
if (match) {
const indent = match[1];
const marker = match[2];
const markerChar = marker[0];
const markerLen = marker.length;
if (!open) {
open = {
start: offset,
markerChar,
markerLen,
openLine: line,
marker,
indent,
};
} else if (
open.markerChar === markerChar &&
markerLen >= open.markerLen
) {
const end = nextNewline === -1 ? buffer.length : nextNewline + 1;
spans.push({
start: open.start,
end,
openLine: open.openLine,
marker: open.marker,
indent: open.indent,
});
open = undefined;
}
}
if (nextNewline === -1) break;
offset = nextNewline + 1;
}
if (open) {
spans.push({
start: open.start,
end: buffer.length,
openLine: open.openLine,
marker: open.marker,
indent: open.indent,
});
}
return spans;
}
function findFenceSpanAt(
spans: FenceSpan[],
index: number,
): FenceSpan | undefined {
return spans.find((span) => index > span.start && index < span.end);
}
function isSafeBreak(spans: FenceSpan[], index: number): boolean {
return !findFenceSpanAt(spans, index);
}