Commit 0975aa4a7c (parent 46fa1c1301), committed by Peter Steinberger:
"Fix newline chunking: split on blank lines even under limit"
@@ -177,69 +177,39 @@ export function chunkByNewline(
export function chunkByParagraph(text: string, limit: number): string[] {
|
export function chunkByParagraph(text: string, limit: number): string[] {
|
||||||
if (!text) return [];
|
if (!text) return [];
|
||||||
if (limit <= 0) return [text];
|
if (limit <= 0) return [text];
|
||||||
if (text.length <= limit) return [text];
|
|
||||||
|
|
||||||
// Normalize to \n so blank line detection is consistent.
|
// Normalize to \n so blank line detection is consistent.
|
||||||
const normalized = text.replace(/\r\n?/g, "\n");
|
const normalized = text.replace(/\r\n?/g, "\n");
|
||||||
|
|
||||||
const parts: string[] = [];
|
// Fast-path: if there are no blank-line paragraph separators, do not split.
|
||||||
const seps: string[] = [];
|
// (We *do not* early-return based on `limit` — newline mode is about paragraph
|
||||||
|
// boundaries, not only exceeding a length limit.)
|
||||||
|
const paragraphRe = /\n[\t ]*\n+/;
|
||||||
|
if (!paragraphRe.test(normalized)) {
|
||||||
|
return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
const parts: string[] = [];
|
||||||
const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace
|
const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace
|
||||||
let lastIndex = 0;
|
let lastIndex = 0;
|
||||||
for (const match of normalized.matchAll(re)) {
|
for (const match of normalized.matchAll(re)) {
|
||||||
const idx = match.index ?? 0;
|
const idx = match.index ?? 0;
|
||||||
parts.push(normalized.slice(lastIndex, idx));
|
parts.push(normalized.slice(lastIndex, idx));
|
||||||
seps.push(match[0]);
|
|
||||||
lastIndex = idx + match[0].length;
|
lastIndex = idx + match[0].length;
|
||||||
}
|
}
|
||||||
parts.push(normalized.slice(lastIndex));
|
parts.push(normalized.slice(lastIndex));
|
||||||
|
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
let current = "";
|
for (const part of parts) {
|
||||||
let pendingSep = "";
|
const paragraph = part.replace(/\s+$/g, "");
|
||||||
|
if (!paragraph.trim()) continue;
|
||||||
const flush = () => {
|
if (paragraph.length <= limit) {
|
||||||
const out = current.trimEnd();
|
chunks.push(paragraph);
|
||||||
if (out) chunks.push(out);
|
} else {
|
||||||
current = "";
|
chunks.push(...chunkText(paragraph, limit));
|
||||||
};
|
|
||||||
|
|
||||||
for (let i = 0; i < parts.length; i++) {
|
|
||||||
const paragraph = parts[i] ?? "";
|
|
||||||
if (!paragraph.trim() && i === parts.length - 1) break;
|
|
||||||
|
|
||||||
const prefix = pendingSep;
|
|
||||||
pendingSep = seps[i] ?? "";
|
|
||||||
|
|
||||||
const candidate = current
|
|
||||||
? `${current}${prefix}${paragraph}`
|
|
||||||
: // Cap leading blank lines so we never exceed `limit` with just prefixes.
|
|
||||||
`${prefix.slice(0, Math.max(0, limit - 1))}${paragraph}`;
|
|
||||||
|
|
||||||
if (candidate.length <= limit) {
|
|
||||||
current = candidate;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Can't fit this paragraph into the current chunk.
|
|
||||||
if (current) flush();
|
|
||||||
|
|
||||||
const paragraphWithPrefix = `${prefix}${paragraph}`;
|
|
||||||
if (paragraphWithPrefix.length <= limit) {
|
|
||||||
current = paragraphWithPrefix;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Paragraph itself is too long; split it by length (preferring newlines/whitespace).
|
|
||||||
const prefixCap = prefix.slice(0, Math.max(0, limit - 1));
|
|
||||||
const remainingLimit = Math.max(1, limit - prefixCap.length);
|
|
||||||
const [first, ...rest] = chunkText(paragraph, remainingLimit);
|
|
||||||
if (first) chunks.push(prefixCap + first);
|
|
||||||
chunks.push(...rest);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.trim()) flush();
|
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user