fix: preserve markdown fences when chunking
This commit is contained in:
@@ -1,17 +1,15 @@
|
||||
import {
|
||||
findFenceSpanAt,
|
||||
isSafeFenceBreak,
|
||||
parseFenceSpans,
|
||||
} from "../markdown/fences.js";
|
||||
|
||||
export type BlockReplyChunking = {
|
||||
minChars: number;
|
||||
maxChars: number;
|
||||
breakPreference?: "paragraph" | "newline" | "sentence";
|
||||
};
|
||||
|
||||
type FenceSpan = {
|
||||
start: number;
|
||||
end: number;
|
||||
openLine: string;
|
||||
marker: string;
|
||||
indent: string;
|
||||
};
|
||||
|
||||
type FenceSplit = {
|
||||
closeFenceLine: string;
|
||||
reopenFenceLine: string;
|
||||
@@ -123,7 +121,10 @@ export class EmbeddedBlockChunker {
|
||||
if (preference === "paragraph") {
|
||||
let paragraphIdx = buffer.indexOf("\n\n");
|
||||
while (paragraphIdx !== -1) {
|
||||
if (paragraphIdx >= minChars && isSafeBreak(fenceSpans, paragraphIdx)) {
|
||||
if (
|
||||
paragraphIdx >= minChars &&
|
||||
isSafeFenceBreak(fenceSpans, paragraphIdx)
|
||||
) {
|
||||
return { index: paragraphIdx };
|
||||
}
|
||||
paragraphIdx = buffer.indexOf("\n\n", paragraphIdx + 2);
|
||||
@@ -133,7 +134,10 @@ export class EmbeddedBlockChunker {
|
||||
if (preference === "paragraph" || preference === "newline") {
|
||||
let newlineIdx = buffer.indexOf("\n");
|
||||
while (newlineIdx !== -1) {
|
||||
if (newlineIdx >= minChars && isSafeBreak(fenceSpans, newlineIdx)) {
|
||||
if (
|
||||
newlineIdx >= minChars &&
|
||||
isSafeFenceBreak(fenceSpans, newlineIdx)
|
||||
) {
|
||||
return { index: newlineIdx };
|
||||
}
|
||||
newlineIdx = buffer.indexOf("\n", newlineIdx + 1);
|
||||
@@ -147,7 +151,7 @@ export class EmbeddedBlockChunker {
|
||||
const at = match.index ?? -1;
|
||||
if (at < minChars) continue;
|
||||
const candidate = at + 1;
|
||||
if (isSafeBreak(fenceSpans, candidate)) {
|
||||
if (isSafeFenceBreak(fenceSpans, candidate)) {
|
||||
sentenceIdx = candidate;
|
||||
}
|
||||
}
|
||||
@@ -168,7 +172,7 @@ export class EmbeddedBlockChunker {
|
||||
if (preference === "paragraph") {
|
||||
let paragraphIdx = window.lastIndexOf("\n\n");
|
||||
while (paragraphIdx >= minChars) {
|
||||
if (isSafeBreak(fenceSpans, paragraphIdx)) {
|
||||
if (isSafeFenceBreak(fenceSpans, paragraphIdx)) {
|
||||
return { index: paragraphIdx };
|
||||
}
|
||||
paragraphIdx = window.lastIndexOf("\n\n", paragraphIdx - 1);
|
||||
@@ -178,7 +182,7 @@ export class EmbeddedBlockChunker {
|
||||
if (preference === "paragraph" || preference === "newline") {
|
||||
let newlineIdx = window.lastIndexOf("\n");
|
||||
while (newlineIdx >= minChars) {
|
||||
if (isSafeBreak(fenceSpans, newlineIdx)) {
|
||||
if (isSafeFenceBreak(fenceSpans, newlineIdx)) {
|
||||
return { index: newlineIdx };
|
||||
}
|
||||
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
|
||||
@@ -192,7 +196,7 @@ export class EmbeddedBlockChunker {
|
||||
const at = match.index ?? -1;
|
||||
if (at < minChars) continue;
|
||||
const candidate = at + 1;
|
||||
if (isSafeBreak(fenceSpans, candidate)) {
|
||||
if (isSafeFenceBreak(fenceSpans, candidate)) {
|
||||
sentenceIdx = candidate;
|
||||
}
|
||||
}
|
||||
@@ -200,13 +204,13 @@ export class EmbeddedBlockChunker {
|
||||
}
|
||||
|
||||
for (let i = window.length - 1; i >= minChars; i--) {
|
||||
if (/\s/.test(window[i]) && isSafeBreak(fenceSpans, i)) {
|
||||
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
|
||||
return { index: i };
|
||||
}
|
||||
}
|
||||
|
||||
if (buffer.length >= maxChars) {
|
||||
if (isSafeBreak(fenceSpans, maxChars)) return { index: maxChars };
|
||||
if (isSafeFenceBreak(fenceSpans, maxChars)) return { index: maxChars };
|
||||
const fence = findFenceSpanAt(fenceSpans, maxChars);
|
||||
if (fence) {
|
||||
return {
|
||||
@@ -229,76 +233,3 @@ function stripLeadingNewlines(value: string): string {
|
||||
while (i < value.length && value[i] === "\n") i++;
|
||||
return i > 0 ? value.slice(i) : value;
|
||||
}
|
||||
|
||||
function parseFenceSpans(buffer: string): FenceSpan[] {
|
||||
const spans: FenceSpan[] = [];
|
||||
let open:
|
||||
| {
|
||||
start: number;
|
||||
markerChar: string;
|
||||
markerLen: number;
|
||||
openLine: string;
|
||||
marker: string;
|
||||
indent: string;
|
||||
}
|
||||
| undefined;
|
||||
let offset = 0;
|
||||
while (offset <= buffer.length) {
|
||||
const nextNewline = buffer.indexOf("\n", offset);
|
||||
const lineEnd = nextNewline === -1 ? buffer.length : nextNewline;
|
||||
const line = buffer.slice(offset, lineEnd);
|
||||
const match = line.match(/^( {0,3})(`{3,}|~{3,})(.*)$/);
|
||||
if (match) {
|
||||
const indent = match[1];
|
||||
const marker = match[2];
|
||||
const markerChar = marker[0];
|
||||
const markerLen = marker.length;
|
||||
if (!open) {
|
||||
open = {
|
||||
start: offset,
|
||||
markerChar,
|
||||
markerLen,
|
||||
openLine: line,
|
||||
marker,
|
||||
indent,
|
||||
};
|
||||
} else if (
|
||||
open.markerChar === markerChar &&
|
||||
markerLen >= open.markerLen
|
||||
) {
|
||||
const end = nextNewline === -1 ? buffer.length : nextNewline + 1;
|
||||
spans.push({
|
||||
start: open.start,
|
||||
end,
|
||||
openLine: open.openLine,
|
||||
marker: open.marker,
|
||||
indent: open.indent,
|
||||
});
|
||||
open = undefined;
|
||||
}
|
||||
}
|
||||
if (nextNewline === -1) break;
|
||||
offset = nextNewline + 1;
|
||||
}
|
||||
if (open) {
|
||||
spans.push({
|
||||
start: open.start,
|
||||
end: buffer.length,
|
||||
openLine: open.openLine,
|
||||
marker: open.marker,
|
||||
indent: open.indent,
|
||||
});
|
||||
}
|
||||
return spans;
|
||||
}
|
||||
|
||||
function findFenceSpanAt(
|
||||
spans: FenceSpan[],
|
||||
index: number,
|
||||
): FenceSpan | undefined {
|
||||
return spans.find((span) => index > span.start && index < span.end);
|
||||
}
|
||||
|
||||
function isSafeBreak(spans: FenceSpan[], index: number): boolean {
|
||||
return !findFenceSpanAt(spans, index);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user