Fix newline chunking: keep paragraphs/lists together
This commit is contained in:
committed by
Peter Steinberger
parent
22cf2b6766
commit
03e9a076b8
@@ -310,10 +310,16 @@ describe("chunkTextWithMode", () => {
|
|||||||
expect(chunks).toEqual(["Line one\nLine two"]);
|
expect(chunks).toEqual(["Line one\nLine two"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("uses newline-based chunking for newline mode", () => {
|
it("uses paragraph-based chunking for newline mode", () => {
|
||||||
const text = "Line one\nLine two";
|
const text = "Line one\nLine two";
|
||||||
const chunks = chunkTextWithMode(text, 1000, "newline");
|
const chunks = chunkTextWithMode(text, 1000, "newline");
|
||||||
expect(chunks).toEqual(["Line one", "Line two"]);
|
expect(chunks).toEqual(["Line one\nLine two"]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("splits on blank lines for newline mode", () => {
|
||||||
|
const text = "Para one\n\nPara two";
|
||||||
|
const chunks = chunkTextWithMode(text, 1000, "newline");
|
||||||
|
expect(chunks).toEqual(["Para one", "Para two"]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -323,17 +329,19 @@ describe("chunkMarkdownTextWithMode", () => {
|
|||||||
expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000));
|
expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000));
|
||||||
});
|
});
|
||||||
|
|
||||||
it("uses newline-based chunking for newline mode", () => {
|
it("uses paragraph-based chunking for newline mode", () => {
|
||||||
const text = "Line one\nLine two";
|
const text = "Line one\nLine two";
|
||||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one", "Line two"]);
|
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one\nLine two"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("does not split inside code fences for newline mode", () => {
|
it("splits on blank lines for newline mode", () => {
|
||||||
|
const text = "Para one\n\nPara two";
|
||||||
|
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Para one", "Para two"]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not split single-newline code fences in newline mode", () => {
|
||||||
const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
|
const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
|
||||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([
|
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
|
||||||
"```js\nconst a = 1;\nconst b = 2;\n```",
|
|
||||||
"After",
|
|
||||||
]);
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ export type TextChunkProvider = ChannelId | typeof INTERNAL_MESSAGE_CHANNEL;
|
|||||||
/**
|
/**
|
||||||
* Chunking mode for outbound messages:
|
* Chunking mode for outbound messages:
|
||||||
* - "length": Split only when exceeding textChunkLimit (default)
|
* - "length": Split only when exceeding textChunkLimit (default)
|
||||||
* - "newline": Split on every newline, with fallback to length-based for long lines
|
* - "newline": Prefer breaking on "soft" boundaries. Historically this split on every
|
||||||
|
* newline; now it only breaks on paragraph boundaries (blank lines) unless the text
|
||||||
|
* exceeds the length limit.
|
||||||
*/
|
*/
|
||||||
export type ChunkMode = "length" | "newline";
|
export type ChunkMode = "length" | "newline";
|
||||||
|
|
||||||
@@ -164,44 +166,105 @@ export function chunkByNewline(
|
|||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Split text into chunks on paragraph boundaries (blank lines), preserving lists and
 * single-newline line wraps inside paragraphs.
 *
 * - Breaks at every paragraph separator: a newline followed by one or more blank lines,
 *   where a "blank" line may contain spaces or tabs.
 * - Each non-empty paragraph becomes its own chunk; single newlines never cause a break.
 * - A paragraph longer than `limit` is split further by delegating to `chunkText`.
 * - The separators themselves, trailing whitespace of each paragraph, and empty
 *   paragraphs are dropped, so no chunk starts or ends with blank lines.
 *
 * Note: blank lines are detected purely textually. A blank line inside a Markdown code
 * fence is treated as a paragraph break like any other.
 *
 * @param text - Text to split. Empty input yields an empty array.
 * @param limit - Maximum chunk length. A non-positive limit disables chunking and the
 *   text is returned as a single chunk.
 * @returns The chunks in original order. Text that has no paragraph break and fits within
 *   `limit` is returned as-is (no newline normalization or trimming).
 */
export function chunkByParagraph(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];

  // Normalize to \n so blank line detection is consistent.
  const normalized = text.replace(/\r\n?/g, "\n");

  // Paragraph break: a newline followed by one or more lines holding only spaces/tabs.
  // Every blank line in the run may carry whitespace, not just the first one.
  const paragraphs = normalized.split(/\n(?:[\t ]*\n)+/);

  // No paragraph break and nothing to shorten: hand the caller's text back untouched.
  // Deliberately NOT an early return on length alone, since text under the limit must
  // still be split at blank lines.
  if (paragraphs.length === 1 && text.length <= limit) return [text];

  const chunks: string[] = [];
  for (const part of paragraphs) {
    const paragraph = part.trimEnd();
    // Leading/trailing separators and whitespace-only paragraphs produce empty parts.
    if (!paragraph) continue;

    if (paragraph.length <= limit) {
      chunks.push(paragraph);
    } else {
      // Paragraph itself is too long; split it by length.
      chunks.push(...chunkText(paragraph, limit));
    }
  }
  return chunks;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unified chunking function that dispatches based on mode.
|
* Unified chunking function that dispatches based on mode.
|
||||||
*/
|
*/
|
||||||
export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
|
export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
|
||||||
if (mode === "newline") {
|
if (mode === "newline") {
|
||||||
const chunks: string[] = [];
|
return chunkByParagraph(text, limit);
|
||||||
const lineChunks = chunkByNewline(text, limit, { splitLongLines: false });
|
|
||||||
for (const line of lineChunks) {
|
|
||||||
const nested = chunkText(line, limit);
|
|
||||||
if (!nested.length && line) {
|
|
||||||
chunks.push(line);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
chunks.push(...nested);
|
|
||||||
}
|
|
||||||
return chunks;
|
|
||||||
}
|
}
|
||||||
return chunkText(text, limit);
|
return chunkText(text, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
|
export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
|
||||||
if (mode === "newline") {
|
if (mode === "newline") {
|
||||||
const spans = parseFenceSpans(text);
|
// NOTE: chunkByParagraph breaks at blank lines without tracking code fences, so a fenced block that contains a blank line can still be split across chunks.
|
||||||
const chunks: string[] = [];
|
// If a paragraph must be split by length, defer to the markdown-aware chunker.
|
||||||
const lineChunks = chunkByNewline(text, limit, {
|
const paragraphChunks = chunkByParagraph(text, limit);
|
||||||
splitLongLines: false,
|
const out: string[] = [];
|
||||||
trimLines: false,
|
for (const chunk of paragraphChunks) {
|
||||||
isSafeBreak: (index) => isSafeFenceBreak(spans, index),
|
const nested = chunkMarkdownText(chunk, limit);
|
||||||
});
|
if (!nested.length && chunk) out.push(chunk);
|
||||||
for (const line of lineChunks) {
|
else out.push(...nested);
|
||||||
const nested = chunkMarkdownText(line, limit);
|
|
||||||
if (!nested.length && line) {
|
|
||||||
chunks.push(line);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
chunks.push(...nested);
|
|
||||||
}
|
}
|
||||||
return chunks;
|
return out;
|
||||||
}
|
}
|
||||||
return chunkMarkdownText(text, limit);
|
return chunkMarkdownText(text, limit);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import {
|
import {
|
||||||
chunkByNewline,
|
chunkByParagraph,
|
||||||
chunkMarkdownTextWithMode,
|
chunkMarkdownTextWithMode,
|
||||||
resolveChunkMode,
|
resolveChunkMode,
|
||||||
resolveTextChunkLimit,
|
resolveTextChunkLimit,
|
||||||
@@ -239,14 +239,15 @@ export async function deliverOutboundPayloads(params: {
|
|||||||
}
|
}
|
||||||
if (chunkMode === "newline") {
|
if (chunkMode === "newline") {
|
||||||
const mode = handler.chunkerMode ?? "text";
|
const mode = handler.chunkerMode ?? "text";
|
||||||
const lineChunks =
|
const blockChunks =
|
||||||
mode === "markdown"
|
mode === "markdown"
|
||||||
? chunkMarkdownTextWithMode(text, textLimit, "newline")
|
? chunkMarkdownTextWithMode(text, textLimit, "newline")
|
||||||
: chunkByNewline(text, textLimit, { splitLongLines: false });
|
: chunkByParagraph(text, textLimit);
|
||||||
if (!lineChunks.length && text) lineChunks.push(text);
|
|
||||||
for (const lineChunk of lineChunks) {
|
if (!blockChunks.length && text) blockChunks.push(text);
|
||||||
const chunks = handler.chunker(lineChunk, textLimit);
|
for (const blockChunk of blockChunks) {
|
||||||
if (!chunks.length && lineChunk) chunks.push(lineChunk);
|
const chunks = handler.chunker(blockChunk, textLimit);
|
||||||
|
if (!chunks.length && blockChunk) chunks.push(blockChunk);
|
||||||
for (const chunk of chunks) {
|
for (const chunk of chunks) {
|
||||||
throwIfAborted(abortSignal);
|
throwIfAborted(abortSignal);
|
||||||
results.push(await handler.sendText(chunk));
|
results.push(await handler.sendText(chunk));
|
||||||
|
|||||||
Reference in New Issue
Block a user