Chunking: avoid splits inside parentheses

This commit is contained in:
Philipp Spiess
2026-01-08 16:52:50 +01:00
committed by Peter Steinberger
parent 491f928a2e
commit 4082b90aa4
2 changed files with 64 additions and 18 deletions

View File

@@ -184,4 +184,29 @@ describe("chunkMarkdownText", () => {
expect(nonFenceLines.join("\n").trim()).not.toBe(""); expect(nonFenceLines.join("\n").trim()).not.toBe("");
} }
}); });
// The 35-character limit falls inside the parenthetical, so the split is
// expected at the whitespace just before the opening "(".
it("keeps parenthetical phrases together", () => {
  const input = "Heads up now (Though now I'm curious)ok";
  const expected = ["Heads up now", "(Though now I'm curious)ok"];
  expect(chunkMarkdownText(input, 35)).toEqual(expected);
});
// Whitespace inside either nesting level must not be chosen as a break;
// the expected split is the space after the outer closing parenthesis.
it("handles nested parentheses", () => {
  const input = "Hello (outer (inner) end) world";
  const result = chunkMarkdownText(input, 26);
  const expected = ["Hello (outer (inner) end)", "world"];
  expect(result).toEqual(expected);
});
// An 82-character parenthetical with no whitespace cannot fit in one chunk:
// the first chunk must be exactly `limit` long and no characters may be lost.
it("hard-breaks when a parenthetical exceeds the limit", () => {
  const limit = 20;
  const text = "(" + "a".repeat(80) + ")";
  const pieces = chunkMarkdownText(text, limit);
  expect(pieces[0]?.length).toBe(limit);
  expect(pieces.join("")).toBe(text);
});
// A stray ")" with no matching "(" must not stop the following space from
// being used as a break point.
it("ignores unmatched closing parentheses", () => {
  const input = "Hello) world (ok)";
  const expected = ["Hello)", "world (ok)"];
  expect(chunkMarkdownText(input, 12)).toEqual(expected);
});
}); });

View File

@@ -90,18 +90,27 @@ export function chunkText(text: string, limit: number): string[] {
while (remaining.length > limit) { while (remaining.length > limit) {
const window = remaining.slice(0, limit); const window = remaining.slice(0, limit);
// 1) Prefer a newline break inside the window. // 1) Prefer a newline break inside the window (outside parentheses).
let breakIdx = window.lastIndexOf("\n"); let lastNewline = -1;
let lastWhitespace = -1;
let depth = 0;
for (let i = 0; i < window.length; i++) {
const char = window[i];
if (char === "(") {
depth += 1;
continue;
}
if (char === ")" && depth > 0) {
depth -= 1;
continue;
}
if (depth !== 0) continue;
if (char === "\n") lastNewline = i;
else if (/\s/.test(char)) lastWhitespace = i;
}
// 2) Otherwise prefer the last whitespace (word boundary) inside the window. // 2) Otherwise prefer the last whitespace (word boundary) inside the window.
if (breakIdx <= 0) { let breakIdx = lastNewline > 0 ? lastNewline : lastWhitespace;
for (let i = window.length - 1; i >= 0; i--) {
if (/\s/.test(window[i])) {
breakIdx = i;
break;
}
}
}
// 3) Fallback: hard break exactly at the limit. // 3) Fallback: hard break exactly at the limit.
if (breakIdx <= 0) breakIdx = limit; if (breakIdx <= 0) breakIdx = limit;
@@ -234,15 +243,27 @@ function pickSafeBreakIndex(
window: string, window: string,
spans: ReturnType<typeof parseFenceSpans>, spans: ReturnType<typeof parseFenceSpans>,
): number { ): number {
let newlineIdx = window.lastIndexOf("\n"); let lastNewline = -1;
while (newlineIdx > 0) { let lastWhitespace = -1;
if (isSafeFenceBreak(spans, newlineIdx)) return newlineIdx; let depth = 0;
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
} for (let i = 0; i < window.length; i++) {
if (!isSafeFenceBreak(spans, i)) continue;
for (let i = window.length - 1; i > 0; i--) { const char = window[i];
if (/\s/.test(window[i]) && isSafeFenceBreak(spans, i)) return i; if (char === "(") {
depth += 1;
continue;
}
if (char === ")" && depth > 0) {
depth -= 1;
continue;
}
if (depth !== 0) continue;
if (char === "\n") lastNewline = i;
else if (/\s/.test(char)) lastWhitespace = i;
} }
if (lastNewline > 0) return lastNewline;
if (lastWhitespace > 0) return lastWhitespace;
return -1; return -1;
} }