Fix paragraph chunking to ignore blank lines inside code fences
This commit is contained in:
committed by
Peter Steinberger
parent
0975aa4a7c
commit
c3f5b4c416
@@ -343,6 +343,17 @@ describe("chunkMarkdownTextWithMode", () => {
|
||||
const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
|
||||
});
|
||||
|
||||
it("does not split on blank lines inside a fenced code block", () => {
|
||||
const text = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
|
||||
});
|
||||
|
||||
it("splits on blank lines between a code fence and following paragraph", () => {
|
||||
const fence = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
|
||||
const text = `${fence}\n\nAfter`;
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([fence, "After"]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveChunkMode", () => {
|
||||
|
||||
@@ -189,11 +189,19 @@ export function chunkByParagraph(text: string, limit: number): string[] {
|
||||
return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
|
||||
}
|
||||
|
||||
const spans = parseFenceSpans(normalized);
|
||||
|
||||
const parts: string[] = [];
|
||||
const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace
|
||||
let lastIndex = 0;
|
||||
for (const match of normalized.matchAll(re)) {
|
||||
const idx = match.index ?? 0;
|
||||
|
||||
// Do not split on blank lines that occur inside fenced code blocks.
|
||||
if (!isSafeFenceBreak(spans, idx)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
parts.push(normalized.slice(lastIndex, idx));
|
||||
lastIndex = idx + match[0].length;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user