fix: paragraph-aware newline chunking (#1726)

Thanks @tyler6204

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-01-25 13:24:00 +00:00
parent c3f5b4c416
commit 0130ecd800
17 changed files with 39 additions and 24 deletions

View File

@@ -344,6 +344,11 @@ describe("chunkMarkdownTextWithMode", () => {
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
});
// The input is a single fenced code block with no blank lines, so it contains no
// paragraph separator, and at 269 characters it is far longer than the 40-character limit.
// Newline mode is expected to return exactly what chunkMarkdownText returns for the same limit.
it("defers long markdown paragraphs to markdown chunking in newline mode", () => {
const text = `\`\`\`js\n${"const a = 1;\n".repeat(20)}\`\`\``;
expect(chunkMarkdownTextWithMode(text, 40, "newline")).toEqual(chunkMarkdownText(text, 40));
});
it("does not split on blank lines inside a fenced code block", () => {
const text = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);

View File

@@ -173,10 +173,16 @@ export function chunkByNewline(
* - Only breaks at paragraph separators ("\n\n" or more, allowing whitespace on blank lines)
* - Packs multiple paragraphs into a single chunk up to `limit`
* - Falls back to length-based splitting when a single paragraph exceeds `limit`
* (unless `opts.splitLongParagraphs` is `false`, in which case the oversized paragraph is returned whole as one chunk)
*/
export function chunkByParagraph(text: string, limit: number): string[] {
export function chunkByParagraph(
text: string,
limit: number,
opts?: { splitLongParagraphs?: boolean },
): string[] {
if (!text) return [];
if (limit <= 0) return [text];
const splitLongParagraphs = opts?.splitLongParagraphs !== false;
// Normalize to \n so blank line detection is consistent.
const normalized = text.replace(/\r\n?/g, "\n");
@@ -186,7 +192,9 @@ export function chunkByParagraph(text: string, limit: number): string[] {
// boundaries, not only exceeding a length limit.)
const paragraphRe = /\n[\t ]*\n+/;
if (!paragraphRe.test(normalized)) {
return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
if (normalized.length <= limit) return [normalized];
if (!splitLongParagraphs) return [normalized];
return chunkText(normalized, limit);
}
const spans = parseFenceSpans(normalized);
@@ -213,6 +221,8 @@ export function chunkByParagraph(text: string, limit: number): string[] {
if (!paragraph.trim()) continue;
if (paragraph.length <= limit) {
chunks.push(paragraph);
} else if (!splitLongParagraphs) {
chunks.push(paragraph);
} else {
chunks.push(...chunkText(paragraph, limit));
}
@@ -235,7 +245,7 @@ export function chunkMarkdownTextWithMode(text: string, limit: number, mode: Chu
if (mode === "newline") {
// Paragraph chunking is fence-safe because we never split at arbitrary indices.
// If a paragraph must be split by length, defer to the markdown-aware chunker.
const paragraphChunks = chunkByParagraph(text, limit);
const paragraphChunks = chunkByParagraph(text, limit, { splitLongParagraphs: false });
const out: string[] = [];
for (const chunk of paragraphChunks) {
const nested = chunkMarkdownText(chunk, limit);