From 03e9a076b8db919a878f6eb31d2110c344667d8e Mon Sep 17 00:00:00 2001 From: Tyler Yust Date: Sat, 24 Jan 2026 22:06:22 -0800 Subject: [PATCH] Fix newline chunking: keep paragraphs/lists together --- src/auto-reply/chunk.test.ts | 26 +++++--- src/auto-reply/chunk.ts | 117 ++++++++++++++++++++++++++-------- src/infra/outbound/deliver.ts | 15 +++-- 3 files changed, 115 insertions(+), 43 deletions(-) diff --git a/src/auto-reply/chunk.test.ts b/src/auto-reply/chunk.test.ts index 01069d852..f2256c082 100644 --- a/src/auto-reply/chunk.test.ts +++ b/src/auto-reply/chunk.test.ts @@ -310,10 +310,16 @@ describe("chunkTextWithMode", () => { expect(chunks).toEqual(["Line one\nLine two"]); }); - it("uses newline-based chunking for newline mode", () => { + it("uses paragraph-based chunking for newline mode", () => { const text = "Line one\nLine two"; const chunks = chunkTextWithMode(text, 1000, "newline"); - expect(chunks).toEqual(["Line one", "Line two"]); + expect(chunks).toEqual(["Line one\nLine two"]); + }); + + it("splits on blank lines for newline mode", () => { + const text = "Para one\n\nPara two"; + const chunks = chunkTextWithMode(text, 1000, "newline"); + expect(chunks).toEqual(["Para one", "Para two"]); }); }); @@ -323,17 +329,19 @@ describe("chunkMarkdownTextWithMode", () => { expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000)); }); - it("uses newline-based chunking for newline mode", () => { + it("uses paragraph-based chunking for newline mode", () => { const text = "Line one\nLine two"; - expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one", "Line two"]); + expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one\nLine two"]); }); - it("does not split inside code fences for newline mode", () => { + it("splits on blank lines for newline mode", () => { + const text = "Para one\n\nPara two"; + expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Para one", "Para two"]); + }); + + it("does not split single-newline code fences in newline mode", () => { const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter"; - expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([ - "```js\nconst a = 1;\nconst b = 2;\n```", - "After", - ]); + expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]); }); }); diff --git a/src/auto-reply/chunk.ts b/src/auto-reply/chunk.ts index c77c0cd9f..8da1b9f9c 100644 --- a/src/auto-reply/chunk.ts +++ b/src/auto-reply/chunk.ts @@ -13,7 +13,9 @@ export type TextChunkProvider = ChannelId | typeof INTERNAL_MESSAGE_CHANNEL; /** * Chunking mode for outbound messages: * - "length": Split only when exceeding textChunkLimit (default) - * - "newline": Split on every newline, with fallback to length-based for long lines + * - "newline": Prefer breaking on "soft" boundaries. Historically this split on every + * newline; now it only breaks on paragraph boundaries (blank lines) unless the text + * exceeds the length limit. */ export type ChunkMode = "length" | "newline"; @@ -164,44 +166,105 @@ export function chunkByNewline( return chunks; } +/** + * Split text into chunks on paragraph boundaries (blank lines), preserving lists and + * single-newline line wraps inside paragraphs. + * + * - Only breaks at paragraph separators ("\n\n" or more, allowing whitespace on blank lines) + * - Packs multiple paragraphs into a single chunk up to `limit` + * - Falls back to length-based splitting when a single paragraph exceeds `limit` + */ +export function chunkByParagraph(text: string, limit: number): string[] { + if (!text) return []; + if (limit <= 0) return [text]; + if (text.length <= limit) return [text]; + + // Normalize to \n so blank line detection is consistent. + const normalized = text.replace(/\r\n?/g, "\n"); + + const parts: string[] = []; + const seps: string[] = []; + + const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace + let lastIndex = 0; + for (const match of normalized.matchAll(re)) { + const idx = match.index ?? 0; + parts.push(normalized.slice(lastIndex, idx)); + seps.push(match[0]); + lastIndex = idx + match[0].length; + } + parts.push(normalized.slice(lastIndex)); + + const chunks: string[] = []; + let current = ""; + let pendingSep = ""; + + const flush = () => { + const out = current.trimEnd(); + if (out) chunks.push(out); + current = ""; + }; + + for (let i = 0; i < parts.length; i++) { + const paragraph = parts[i] ?? ""; + if (!paragraph.trim() && i === parts.length - 1) break; + + const prefix = pendingSep; + pendingSep = seps[i] ?? ""; + + const candidate = current + ? `${current}${prefix}${paragraph}` + : // Cap leading blank lines so we never exceed `limit` with just prefixes. + `${prefix.slice(0, Math.max(0, limit - 1))}${paragraph}`; + + if (candidate.length <= limit) { + current = candidate; + continue; + } + + // Can't fit this paragraph into the current chunk. + if (current) flush(); + + const paragraphWithPrefix = `${prefix}${paragraph}`; + if (paragraphWithPrefix.length <= limit) { + current = paragraphWithPrefix; + continue; + } + + // Paragraph itself is too long; split it by length (preferring newlines/whitespace). + const prefixCap = prefix.slice(0, Math.max(0, limit - 1)); + const remainingLimit = Math.max(1, limit - prefixCap.length); + const [first, ...rest] = chunkText(paragraph, remainingLimit); + if (first) chunks.push(prefixCap + first); + chunks.push(...rest); + } + + if (current.trim()) flush(); + return chunks; +} + /** * Unified chunking function that dispatches based on mode. */ export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] { if (mode === "newline") { - const chunks: string[] = []; - const lineChunks = chunkByNewline(text, limit, { splitLongLines: false }); - for (const line of lineChunks) { - const nested = chunkText(line, limit); - if (!nested.length && line) { - chunks.push(line); - continue; - } - chunks.push(...nested); - } - return chunks; + return chunkByParagraph(text, limit); } return chunkText(text, limit); } export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] { if (mode === "newline") { - const spans = parseFenceSpans(text); - const chunks: string[] = []; - const lineChunks = chunkByNewline(text, limit, { - splitLongLines: false, - trimLines: false, - isSafeBreak: (index) => isSafeFenceBreak(spans, index), - }); - for (const line of lineChunks) { - const nested = chunkMarkdownText(line, limit); - if (!nested.length && line) { - chunks.push(line); - continue; - } - chunks.push(...nested); + // Paragraph chunking is fence-safe because we never split at arbitrary indices. + // If a paragraph must be split by length, defer to the markdown-aware chunker. + const paragraphChunks = chunkByParagraph(text, limit); + const out: string[] = []; + for (const chunk of paragraphChunks) { + const nested = chunkMarkdownText(chunk, limit); + if (!nested.length && chunk) out.push(chunk); + else out.push(...nested); } - return chunks; + return out; } return chunkMarkdownText(text, limit); } diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts index cd0ccd5ff..d246889e9 100644 --- a/src/infra/outbound/deliver.ts +++ b/src/infra/outbound/deliver.ts @@ -1,5 +1,5 @@ import { - chunkByNewline, + chunkByParagraph, chunkMarkdownTextWithMode, resolveChunkMode, resolveTextChunkLimit, @@ -239,14 +239,15 @@ export async function deliverOutboundPayloads(params: { } if (chunkMode === "newline") { const mode = handler.chunkerMode ?? "text"; - const lineChunks = + const blockChunks = mode === "markdown" ? chunkMarkdownTextWithMode(text, textLimit, "newline") - : chunkByNewline(text, textLimit, { splitLongLines: false }); - if (!lineChunks.length && text) lineChunks.push(text); - for (const lineChunk of lineChunks) { - const chunks = handler.chunker(lineChunk, textLimit); - if (!chunks.length && lineChunk) chunks.push(lineChunk); + : chunkByParagraph(text, textLimit); + + if (!blockChunks.length && text) blockChunks.push(text); + for (const blockChunk of blockChunks) { + const chunks = handler.chunker(blockChunk, textLimit); + if (!chunks.length && blockChunk) chunks.push(blockChunk); for (const chunk of chunks) { throwIfAborted(abortSignal); results.push(await handler.sendText(chunk));