Chunking: avoid splits inside parentheses

This commit is contained in:
Philipp Spiess
2026-01-08 16:52:50 +01:00
committed by Peter Steinberger
parent 491f928a2e
commit 4082b90aa4
2 changed files with 64 additions and 18 deletions

View File

@@ -184,4 +184,29 @@ describe("chunkMarkdownText", () => {
expect(nonFenceLines.join("\n").trim()).not.toBe("");
}
});
// The 35-character limit lands inside the parenthetical, so the expected split
// is at the space just before the opening parenthesis: the whole group moves
// to the second chunk instead of being cut at one of its inner spaces.
it("keeps parenthetical phrases together", () => {
const text = "Heads up now (Though now I'm curious)ok";
const chunks = chunkMarkdownText(text, 35);
expect(chunks).toEqual(["Heads up now", "(Though now I'm curious)ok"]);
});
// With a limit of 26 the only space after the outermost closing parenthesis
// is the last character of the window; the expected split is there, not at
// any of the spaces inside the nested group.
it("handles nested parentheses", () => {
const text = "Hello (outer (inner) end) world";
const chunks = chunkMarkdownText(text, 26);
expect(chunks).toEqual(["Hello (outer (inner) end)", "world"]);
});
// The input is an 82-character parenthetical with no whitespace, so it cannot
// be kept intact under a limit of 20. The first chunk is expected to be exactly
// 20 characters long, and joining all chunks must reproduce the input unchanged.
it("hard-breaks when a parenthetical exceeds the limit", () => {
const text = `(${"a".repeat(80)})`;
const chunks = chunkMarkdownText(text, 20);
expect(chunks[0]?.length).toBe(20);
expect(chunks.join("")).toBe(text);
});
// The first closing parenthesis has no matching opener. It is expected not to
// affect chunking: the split still happens at the space that follows it, and
// the balanced group later in the text stays in one chunk.
it("ignores unmatched closing parentheses", () => {
const text = "Hello) world (ok)";
const chunks = chunkMarkdownText(text, 12);
expect(chunks).toEqual(["Hello)", "world (ok)"]);
});
});

View File

@@ -90,18 +90,27 @@ export function chunkText(text: string, limit: number): string[] {
while (remaining.length > limit) {
const window = remaining.slice(0, limit);
// 1) Prefer a newline break inside the window.
let breakIdx = window.lastIndexOf("\n");
// 1) Prefer a newline break inside the window (outside parentheses).
let lastNewline = -1;
let lastWhitespace = -1;
let depth = 0;
for (let i = 0; i < window.length; i++) {
const char = window[i];
if (char === "(") {
depth += 1;
continue;
}
if (char === ")" && depth > 0) {
depth -= 1;
continue;
}
if (depth !== 0) continue;
if (char === "\n") lastNewline = i;
else if (/\s/.test(char)) lastWhitespace = i;
}
// 2) Otherwise prefer the last whitespace (word boundary) inside the window (outside parentheses).
if (breakIdx <= 0) {
for (let i = window.length - 1; i >= 0; i--) {
if (/\s/.test(window[i])) {
breakIdx = i;
break;
}
}
}
let breakIdx = lastNewline > 0 ? lastNewline : lastWhitespace;
// 3) Fallback: hard break exactly at the limit.
if (breakIdx <= 0) breakIdx = limit;
@@ -234,15 +243,27 @@ function pickSafeBreakIndex(
window: string,
spans: ReturnType<typeof parseFenceSpans>,
): number {
// NOTE(review): the diff markers were lost in this rendering. Judging by the
// hunk header's line counts, the next seven lines (the backwards newline
// search and the backwards whitespace search) look like the removed side of
// the hunk, and the single forward pass below them is the replacement. Confirm
// against the raw diff.
let newlineIdx = window.lastIndexOf("\n");
while (newlineIdx > 0) {
if (isSafeFenceBreak(spans, newlineIdx)) return newlineIdx;
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
}
for (let i = window.length - 1; i > 0; i--) {
if (/\s/.test(window[i]) && isSafeFenceBreak(spans, i)) return i;
// Forward pass over the window: remember the last newline and the last other
// whitespace character seen while the parenthesis depth is 0.
let lastNewline = -1;
let lastWhitespace = -1;
let depth = 0;
for (let i = 0; i < window.length; i++) {
// Positions rejected by isSafeFenceBreak are skipped before any parenthesis
// tracking, so parentheses at those positions do not change the depth.
if (!isSafeFenceBreak(spans, i)) continue;
const char = window[i];
if (char === "(") {
depth += 1;
continue;
}
// Only a ")" that closes an open "(" lowers the depth; a ")" seen at depth 0
// falls through and is treated like any other non-whitespace character.
if (char === ")" && depth > 0) {
depth -= 1;
continue;
}
// Inside parentheses nothing is recorded as a break candidate.
if (depth !== 0) continue;
if (char === "\n") lastNewline = i;
else if (/\s/.test(char)) lastWhitespace = i;
}
// A newline is preferred over other whitespace. Index 0 is never returned
// (both checks require > 0); -1 is returned when no candidate was recorded.
if (lastNewline > 0) return lastNewline;
if (lastWhitespace > 0) return lastWhitespace;
return -1;
}