refactor: share paren-aware chunk break scan
This commit is contained in:
@@ -25,6 +25,45 @@ function expectFencesBalanced(chunks: string[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ChunkCase = {
|
||||||
|
name: string;
|
||||||
|
text: string;
|
||||||
|
limit: number;
|
||||||
|
expected: string[];
|
||||||
|
};
|
||||||
|
|
||||||
|
function runChunkCases(
|
||||||
|
chunker: (text: string, limit: number) => string[],
|
||||||
|
cases: ChunkCase[],
|
||||||
|
) {
|
||||||
|
for (const { name, text, limit, expected } of cases) {
|
||||||
|
it(name, () => {
|
||||||
|
expect(chunker(text, limit)).toEqual(expected);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const parentheticalCases: ChunkCase[] = [
|
||||||
|
{
|
||||||
|
name: "keeps parenthetical phrases together",
|
||||||
|
text: "Heads up now (Though now I'm curious)ok",
|
||||||
|
limit: 35,
|
||||||
|
expected: ["Heads up now", "(Though now I'm curious)ok"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "handles nested parentheses",
|
||||||
|
text: "Hello (outer (inner) end) world",
|
||||||
|
limit: 26,
|
||||||
|
expected: ["Hello (outer (inner) end)", "world"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ignores unmatched closing parentheses",
|
||||||
|
text: "Hello) world (ok)",
|
||||||
|
limit: 12,
|
||||||
|
expected: ["Hello)", "world (ok)"],
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
describe("chunkText", () => {
|
describe("chunkText", () => {
|
||||||
it("keeps multi-line text in one chunk when under limit", () => {
|
it("keeps multi-line text in one chunk when under limit", () => {
|
||||||
const text = "Line one\n\nLine two\n\nLine three";
|
const text = "Line one\n\nLine two\n\nLine three";
|
||||||
@@ -68,11 +107,7 @@ describe("chunkText", () => {
|
|||||||
expect(chunks).toEqual(["Supercalif", "ragilistic", "expialidoc", "ious"]);
|
expect(chunks).toEqual(["Supercalif", "ragilistic", "expialidoc", "ious"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("keeps parenthetical phrases together", () => {
|
runChunkCases(chunkText, [parentheticalCases[0]]);
|
||||||
const text = "Heads up now (Though now I'm curious)ok";
|
|
||||||
const chunks = chunkText(text, 35);
|
|
||||||
expect(chunks).toEqual(["Heads up now", "(Though now I'm curious)ok"]);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("resolveTextChunkLimit", () => {
|
describe("resolveTextChunkLimit", () => {
|
||||||
@@ -191,17 +226,7 @@ describe("chunkMarkdownText", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it("keeps parenthetical phrases together", () => {
|
runChunkCases(chunkMarkdownText, parentheticalCases);
|
||||||
const text = "Heads up now (Though now I'm curious)ok";
|
|
||||||
const chunks = chunkMarkdownText(text, 35);
|
|
||||||
expect(chunks).toEqual(["Heads up now", "(Though now I'm curious)ok"]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("handles nested parentheses", () => {
|
|
||||||
const text = "Hello (outer (inner) end) world";
|
|
||||||
const chunks = chunkMarkdownText(text, 26);
|
|
||||||
expect(chunks).toEqual(["Hello (outer (inner) end)", "world"]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("hard-breaks when a parenthetical exceeds the limit", () => {
|
it("hard-breaks when a parenthetical exceeds the limit", () => {
|
||||||
const text = `(${"a".repeat(80)})`;
|
const text = `(${"a".repeat(80)})`;
|
||||||
@@ -209,10 +234,4 @@ describe("chunkMarkdownText", () => {
|
|||||||
expect(chunks[0]?.length).toBe(20);
|
expect(chunks[0]?.length).toBe(20);
|
||||||
expect(chunks.join("")).toBe(text);
|
expect(chunks.join("")).toBe(text);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("ignores unmatched closing parentheses", () => {
|
|
||||||
const text = "Hello) world (ok)";
|
|
||||||
const chunks = chunkMarkdownText(text, 12);
|
|
||||||
expect(chunks).toEqual(["Hello)", "world (ok)"]);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -91,23 +91,7 @@ export function chunkText(text: string, limit: number): string[] {
|
|||||||
const window = remaining.slice(0, limit);
|
const window = remaining.slice(0, limit);
|
||||||
|
|
||||||
// 1) Prefer a newline break inside the window (outside parentheses).
|
// 1) Prefer a newline break inside the window (outside parentheses).
|
||||||
let lastNewline = -1;
|
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window);
|
||||||
let lastWhitespace = -1;
|
|
||||||
let depth = 0;
|
|
||||||
for (let i = 0; i < window.length; i++) {
|
|
||||||
const char = window[i];
|
|
||||||
if (char === "(") {
|
|
||||||
depth += 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (char === ")" && depth > 0) {
|
|
||||||
depth -= 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (depth !== 0) continue;
|
|
||||||
if (char === "\n") lastNewline = i;
|
|
||||||
else if (/\s/.test(char)) lastWhitespace = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) Otherwise prefer the last whitespace (word boundary) inside the window.
|
// 2) Otherwise prefer the last whitespace (word boundary) inside the window.
|
||||||
let breakIdx = lastNewline > 0 ? lastNewline : lastWhitespace;
|
let breakIdx = lastNewline > 0 ? lastNewline : lastWhitespace;
|
||||||
@@ -243,12 +227,26 @@ function pickSafeBreakIndex(
|
|||||||
window: string,
|
window: string,
|
||||||
spans: ReturnType<typeof parseFenceSpans>,
|
spans: ReturnType<typeof parseFenceSpans>,
|
||||||
): number {
|
): number {
|
||||||
|
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(
|
||||||
|
window,
|
||||||
|
(index) => isSafeFenceBreak(spans, index),
|
||||||
|
);
|
||||||
|
|
||||||
|
if (lastNewline > 0) return lastNewline;
|
||||||
|
if (lastWhitespace > 0) return lastWhitespace;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
function scanParenAwareBreakpoints(
|
||||||
|
window: string,
|
||||||
|
isAllowed: (index: number) => boolean = () => true,
|
||||||
|
): { lastNewline: number; lastWhitespace: number } {
|
||||||
let lastNewline = -1;
|
let lastNewline = -1;
|
||||||
let lastWhitespace = -1;
|
let lastWhitespace = -1;
|
||||||
let depth = 0;
|
let depth = 0;
|
||||||
|
|
||||||
for (let i = 0; i < window.length; i++) {
|
for (let i = 0; i < window.length; i++) {
|
||||||
if (!isSafeFenceBreak(spans, i)) continue;
|
if (!isAllowed(i)) continue;
|
||||||
const char = window[i];
|
const char = window[i];
|
||||||
if (char === "(") {
|
if (char === "(") {
|
||||||
depth += 1;
|
depth += 1;
|
||||||
@@ -263,7 +261,5 @@ function pickSafeBreakIndex(
|
|||||||
else if (/\s/.test(char)) lastWhitespace = i;
|
else if (/\s/.test(char)) lastWhitespace = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lastNewline > 0) return lastNewline;
|
return { lastNewline, lastWhitespace };
|
||||||
if (lastWhitespace > 0) return lastWhitespace;
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user