Fix newline chunking: keep paragraphs/lists together

2026-01-24 22:06:22 -08:00
parent 22cf2b6766
commit 03e9a076b8
3 changed files with 115 additions and 43 deletions
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -310,10 +310,16 @@ describe("chunkTextWithMode", () => {
    expect(chunks).toEqual(["Line one\nLine two"]);
  });

-  it("uses newline-based chunking for newline mode", () => {
+  it("uses paragraph-based chunking for newline mode", () => {
    const text = "Line one\nLine two";
    const chunks = chunkTextWithMode(text, 1000, "newline");
-    expect(chunks).toEqual(["Line one", "Line two"]);
+    expect(chunks).toEqual(["Line one\nLine two"]);
+  });
+
+  it("splits on blank lines for newline mode", () => {
+    const text = "Para one\n\nPara two";
+    const chunks = chunkTextWithMode(text, 1000, "newline");
+    expect(chunks).toEqual(["Para one", "Para two"]);
  });
 });

@@ -323,17 +329,19 @@ describe("chunkMarkdownTextWithMode", () => {
    expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000));
  });

-  it("uses newline-based chunking for newline mode", () => {
+  it("uses paragraph-based chunking for newline mode", () => {
    const text = "Line one\nLine two";
-    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one", "Line two"]);
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one\nLine two"]);
  });

-  it("does not split inside code fences for newline mode", () => {
+  it("splits on blank lines for newline mode", () => {
+    const text = "Para one\n\nPara two";
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Para one", "Para two"]);
+  });
+
+  it("does not split single-newline code fences in newline mode", () => {
    const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
-    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([
-      "```js\nconst a = 1;\nconst b = 2;\n```",
-      "After",
-    ]);
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
  });
 });

--- a/src/auto-reply/chunk.ts
+++ b/src/auto-reply/chunk.ts
@@ -13,7 +13,9 @@ export type TextChunkProvider = ChannelId | typeof INTERNAL_MESSAGE_CHANNEL;
 /**
 * Chunking mode for outbound messages:
 * - "length": Split only when exceeding textChunkLimit (default)
- * - "newline": Split on every newline, with fallback to length-based for long lines
+ * - "newline": Break on paragraph boundaries (blank lines); a single newline alone
+ *   never forces a break. Historically this split on every newline. Paragraphs that
+ *   exceed the length limit are additionally split by length.
 */
 export type ChunkMode = "length" | "newline";

@@ -164,44 +166,105 @@ export function chunkByNewline(
  return chunks;
 }

+/**
+ * Split text into chunks on paragraph boundaries (blank lines), preserving lists and
+ * single-newline line wraps inside paragraphs.
+ *
+ * - Breaks at every paragraph separator ("\n\n" or more, allowing whitespace on blank
+ *   lines), even when the whole text would fit in `limit`; the separator is dropped.
+ * - Never breaks on a blank line inside a fenced code block, so fences stay balanced.
+ * - Falls back to length-based splitting when a single paragraph exceeds `limit`.
+ * - Text with no paragraph break that fits in `limit` is returned unchanged.
+ *
+ * @param text - Text to chunk; "\r\n" and "\r" line endings are treated as "\n".
+ * @param limit - Maximum chunk length. A non-positive limit disables chunking.
+ * @returns Chunks in source order, without trailing whitespace; blank paragraphs
+ *   produce no chunk.
+ *
+ * @example
+ * chunkByParagraph("Para one\n\nPara two", 1000); // ["Para one", "Para two"]
+ * chunkByParagraph("- a\n- b", 1000); // ["- a\n- b"]
+ */
+export function chunkByParagraph(text: string, limit: number): string[] {
+  if (!text) return [];
+  // A non-positive limit cannot be honored; treat it as "no chunking".
+  if (limit <= 0) return [text];
+
+  // Normalize to \n so blank line detection is consistent.
+  const normalized = text.replace(/\r\n?/g, "\n");
+
+  const paragraphs = splitOnParagraphBreaks(normalized);
+
+  // Nothing to split on and nothing to shorten: return the input untouched
+  // (original line endings and surrounding whitespace included).
+  if (paragraphs.length === 1 && text.length <= limit) return [text];
+
+  const chunks: string[] = [];
+  for (const rawParagraph of paragraphs) {
+    const paragraph = rawParagraph.trimEnd();
+    // Leading/trailing separators yield empty pieces; they carry no content.
+    if (!paragraph.trim()) continue;
+
+    if (paragraph.length <= limit) {
+      chunks.push(paragraph);
+      continue;
+    }
+
+    // Paragraph itself is too long; split it by length (preferring newlines/whitespace).
+    // NOTE(review): this reuses the plain-text chunker, so an over-limit fenced block
+    // can still be cut mid-fence — confirm whether markdown callers need more here.
+    chunks.push(...chunkText(paragraph, limit));
+  }
+  return chunks;
+}
+
+/**
+ * Cut text at every blank-line separator that lies outside a fenced code block.
+ *
+ * @param normalized - Text whose line endings are already "\n".
+ * @returns The pieces between separators, in order. Separators are not included, so
+ *   N usable separators yield N + 1 pieces, some of which may be empty.
+ */
+function splitOnParagraphBreaks(normalized: string): string[] {
+  // A blank line inside a fenced code block is code, not a paragraph boundary;
+  // splitting there would leave one chunk with an unclosed fence.
+  const fenceSpans = parseFenceSpans(normalized);
+  const re = /\n[\t ]*\n+/g; // paragraph break: blank line(s), allowing whitespace
+
+  const pieces: string[] = [];
+  let lastIndex = 0;
+  for (const match of normalized.matchAll(re)) {
+    const idx = match.index ?? 0;
+    if (!isSafeFenceBreak(fenceSpans, idx)) continue;
+    pieces.push(normalized.slice(lastIndex, idx));
+    lastIndex = idx + match[0].length;
+  }
+  pieces.push(normalized.slice(lastIndex));
+  return pieces;
+}
+
 /**
 * Unified chunking function that dispatches based on mode.
 */
 export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
-    const chunks: string[] = [];
-    const lineChunks = chunkByNewline(text, limit, { splitLongLines: false });
-    for (const line of lineChunks) {
-      const nested = chunkText(line, limit);
-      if (!nested.length && line) {
-        chunks.push(line);
-        continue;
-      }
-      chunks.push(...nested);
-    }
-    return chunks;
+    return chunkByParagraph(text, limit); // paragraph-based: a single newline alone no longer forces a split
  }
  return chunkText(text, limit);
 }

 export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
-    const spans = parseFenceSpans(text);
-    const chunks: string[] = [];
-    const lineChunks = chunkByNewline(text, limit, {
-      splitLongLines: false,
-      trimLines: false,
-      isSafeBreak: (index) => isSafeFenceBreak(spans, index),
-    });
-    for (const line of lineChunks) {
-      const nested = chunkMarkdownText(line, limit);
-      if (!nested.length && line) {
-        chunks.push(line);
-        continue;
-      }
-      chunks.push(...nested);
+    // Split into paragraph chunks first, then run each chunk through chunkMarkdownText.
+    // NOTE(review): over-limit paragraphs are already length-split inside chunkByParagraph (via chunkText) — confirm fenced code survives that.
+    const paragraphChunks = chunkByParagraph(text, limit);
+    const out: string[] = [];
+    for (const chunk of paragraphChunks) {
+      const nested = chunkMarkdownText(chunk, limit);
+      if (!nested.length && chunk) out.push(chunk);
+      else out.push(...nested);
    }
-    return chunks;
+    return out;
  }
  return chunkMarkdownText(text, limit);
 }
--- a/src/infra/outbound/deliver.ts
+++ b/src/infra/outbound/deliver.ts
@@ -1,5 +1,5 @@
 import {
-  chunkByNewline,
+  chunkByParagraph,
  chunkMarkdownTextWithMode,
  resolveChunkMode,
  resolveTextChunkLimit,
@@ -239,14 +239,15 @@ export async function deliverOutboundPayloads(params: {
    }
    if (chunkMode === "newline") {
      const mode = handler.chunkerMode ?? "text";
-      const lineChunks =
+      const blockChunks =
        mode === "markdown"
          ? chunkMarkdownTextWithMode(text, textLimit, "newline")
-          : chunkByNewline(text, textLimit, { splitLongLines: false });
-      if (!lineChunks.length && text) lineChunks.push(text);
-      for (const lineChunk of lineChunks) {
-        const chunks = handler.chunker(lineChunk, textLimit);
-        if (!chunks.length && lineChunk) chunks.push(lineChunk);
+          : chunkByParagraph(text, textLimit);
+
+      if (!blockChunks.length && text) blockChunks.push(text);
+      for (const blockChunk of blockChunks) {
+        const chunks = handler.chunker(blockChunk, textLimit);
+        if (!chunks.length && blockChunk) chunks.push(blockChunk);
        for (const chunk of chunks) {
          throwIfAborted(abortSignal);
          results.push(await handler.sendText(chunk));