Chunking: avoid splits inside parentheses
This commit is contained in:
committed by
Peter Steinberger
parent
491f928a2e
commit
4082b90aa4
@@ -184,4 +184,29 @@ describe("chunkMarkdownText", () => {
       expect(nonFenceLines.join("\n").trim()).not.toBe("");
     }
   });
+
+  it("keeps parenthetical phrases together", () => {
+    const text = "Heads up now (Though now I'm curious)ok";
+    const chunks = chunkMarkdownText(text, 35);
+    expect(chunks).toEqual(["Heads up now", "(Though now I'm curious)ok"]);
+  });
+
+  it("handles nested parentheses", () => {
+    const text = "Hello (outer (inner) end) world";
+    const chunks = chunkMarkdownText(text, 26);
+    expect(chunks).toEqual(["Hello (outer (inner) end)", "world"]);
+  });
+
+  it("hard-breaks when a parenthetical exceeds the limit", () => {
+    const text = `(${"a".repeat(80)})`;
+    const chunks = chunkMarkdownText(text, 20);
+    expect(chunks[0]?.length).toBe(20);
+    expect(chunks.join("")).toBe(text);
+  });
+
+  it("ignores unmatched closing parentheses", () => {
+    const text = "Hello) world (ok)";
+    const chunks = chunkMarkdownText(text, 12);
+    expect(chunks).toEqual(["Hello)", "world (ok)"]);
+  });
 });
@@ -90,18 +90,27 @@ export function chunkText(text: string, limit: number): string[] {
   while (remaining.length > limit) {
     const window = remaining.slice(0, limit);
 
-    // 1) Prefer a newline break inside the window.
-    let breakIdx = window.lastIndexOf("\n");
+    // 1) Prefer a newline break inside the window (outside parentheses).
+    let lastNewline = -1;
+    let lastWhitespace = -1;
+    let depth = 0;
+    for (let i = 0; i < window.length; i++) {
+      const char = window[i];
+      if (char === "(") {
+        depth += 1;
+        continue;
+      }
+      if (char === ")" && depth > 0) {
+        depth -= 1;
+        continue;
+      }
+      if (depth !== 0) continue;
+      if (char === "\n") lastNewline = i;
+      else if (/\s/.test(char)) lastWhitespace = i;
+    }
 
     // 2) Otherwise prefer the last whitespace (word boundary) inside the window.
-    if (breakIdx <= 0) {
-      for (let i = window.length - 1; i >= 0; i--) {
-        if (/\s/.test(window[i])) {
-          breakIdx = i;
-          break;
-        }
-      }
-    }
+    let breakIdx = lastNewline > 0 ? lastNewline : lastWhitespace;
 
     // 3) Fallback: hard break exactly at the limit.
     if (breakIdx <= 0) breakIdx = limit;
@@ -234,15 +243,27 @@ function pickSafeBreakIndex(
   window: string,
   spans: ReturnType<typeof parseFenceSpans>,
 ): number {
-  let newlineIdx = window.lastIndexOf("\n");
-  while (newlineIdx > 0) {
-    if (isSafeFenceBreak(spans, newlineIdx)) return newlineIdx;
-    newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
-  }
-
-  for (let i = window.length - 1; i > 0; i--) {
-    if (/\s/.test(window[i]) && isSafeFenceBreak(spans, i)) return i;
+  let lastNewline = -1;
+  let lastWhitespace = -1;
+  let depth = 0;
+
+  for (let i = 0; i < window.length; i++) {
+    if (!isSafeFenceBreak(spans, i)) continue;
+    const char = window[i];
+    if (char === "(") {
+      depth += 1;
+      continue;
+    }
+    if (char === ")" && depth > 0) {
+      depth -= 1;
+      continue;
+    }
+    if (depth !== 0) continue;
+    if (char === "\n") lastNewline = i;
+    else if (/\s/.test(char)) lastWhitespace = i;
   }
 
+  if (lastNewline > 0) return lastNewline;
+  if (lastWhitespace > 0) return lastWhitespace;
   return -1;
 }
Reference in New Issue
Block a user