Fix paragraph chunking to ignore blank lines inside code fences
This commit is contained in:
Committed by: Peter Steinberger
Parent: 0975aa4a7c
Commit: c3f5b4c416
@@ -343,6 +343,17 @@ describe("chunkMarkdownTextWithMode", () => {
     const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
     expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
   });
+
+  it("does not split on blank lines inside a fenced code block", () => {
+    const text = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
+  });
+
+  it("splits on blank lines between a code fence and following paragraph", () => {
+    const fence = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
+    const text = `${fence}\n\nAfter`;
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([fence, "After"]);
+  });
 });

 describe("resolveChunkMode", () => {
@@ -189,11 +189,19 @@ export function chunkByParagraph(text: string, limit: number): string[] {
     return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
   }

+  const spans = parseFenceSpans(normalized);
+
   const parts: string[] = [];
   const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace
   let lastIndex = 0;
   for (const match of normalized.matchAll(re)) {
     const idx = match.index ?? 0;
+
+    // Do not split on blank lines that occur inside fenced code blocks.
+    if (!isSafeFenceBreak(spans, idx)) {
+      continue;
+    }
+
     parts.push(normalized.slice(lastIndex, idx));
     lastIndex = idx + match[0].length;
   }
Reference in New Issue
Block a user