Fix newline chunking: keep paragraphs/lists together

2026-01-24 22:06:22 -08:00
parent 22cf2b6766
commit 03e9a076b8
3 changed files with 115 additions and 43 deletions
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -310,10 +310,16 @@ describe("chunkTextWithMode", () => {
    expect(chunks).toEqual(["Line one\nLine two"]);
  });
-  it("uses newline-based chunking for newline mode", () => {
+  it("uses paragraph-based chunking for newline mode", () => {
    const text = "Line one\nLine two";
    const chunks = chunkTextWithMode(text, 1000, "newline");
-    expect(chunks).toEqual(["Line one", "Line two"]);
+    expect(chunks).toEqual(["Line one\nLine two"]);
  });
  it("splits on blank lines for newline mode", () => {
    const text = "Para one\n\nPara two";
    const chunks = chunkTextWithMode(text, 1000, "newline");
    expect(chunks).toEqual(["Para one", "Para two"]);
  });
 });
@@ -323,17 +329,19 @@ describe("chunkMarkdownTextWithMode", () => {
    expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000));
  });
-  it("uses newline-based chunking for newline mode", () => {
+  it("uses paragraph-based chunking for newline mode", () => {
    const text = "Line one\nLine two";
-    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one", "Line two"]);
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one\nLine two"]);
  });
-  it("does not split inside code fences for newline mode", () => {
+  it("splits on blank lines for newline mode", () => {
    const text = "Para one\n\nPara two";
    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Para one", "Para two"]);
  });
  it("does not split single-newline code fences in newline mode", () => {
    const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
-    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([
+    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
      "```js\nconst a = 1;\nconst b = 2;\n```",
      "After",
    ]);
  });
 });
--- a/src/auto-reply/chunk.ts
+++ b/src/auto-reply/chunk.ts
@@ -13,7 +13,9 @@ export type TextChunkProvider = ChannelId | typeof INTERNAL_MESSAGE_CHANNEL;
 /**
 * Chunking mode for outbound messages:
 * - "length": Split only when exceeding textChunkLimit (default)
- * - "newline": Split on every newline, with fallback to length-based for long lines
+ * - "newline": Prefer breaking on "soft" boundaries. Historically this split on every
 *   newline; now it only breaks on paragraph boundaries (blank lines) unless the text
 *   exceeds the length limit.
 */
 export type ChunkMode = "length" | "newline";
@@ -164,44 +166,105 @@ export function chunkByNewline(
  return chunks;
 }
 /**
 * Break `text` into chunks at paragraph boundaries, i.e. runs of blank lines.
 * Single newlines inside a paragraph (wrapped lines, list items) are never used
 * as split points by this function itself.
 *
 * Behaviour:
 * - Empty input yields `[]`; a non-positive `limit` or text that already fits in
 *   `limit` is returned unchanged as a single chunk (no line-ending normalization).
 * - Otherwise `\r\n` / `\r` are normalized to `\n`, and consecutive paragraphs are
 *   packed greedily into one chunk while the result stays within `limit`.
 * - A chunk that starts after a flush keeps the blank-line separator that preceded
 *   its first paragraph; trailing whitespace of every flushed chunk is trimmed.
 * - A paragraph that cannot fit on its own is delegated to `chunkText` (defined
 *   elsewhere in this module) with the budget left after the capped separator.
 * - No Markdown awareness: a blank line inside a fenced code block is a paragraph
 *   boundary like any other.
 *
 * @param text  Text to split.
 * @param limit Maximum chunk length in UTF-16 code units.
 * @returns Chunks in original order.
 */
 export function chunkByParagraph(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0 || text.length <= limit) return [text];

  // Work on \n-only text so a blank line looks the same regardless of origin.
  const source = text.replace(/\r\n?/g, "\n");

  // Blank line(s), where the first "blank" line may hold tabs/spaces.
  const breakPattern = /\n[\t ]*\n+/g;
  const paragraphs: string[] = [];
  const separators: string[] = [];
  let cursor = 0;
  let hit: RegExpExecArray | null;
  while ((hit = breakPattern.exec(source)) !== null) {
    paragraphs.push(source.slice(cursor, hit.index));
    separators.push(hit[0]);
    cursor = hit.index + hit[0].length;
  }
  paragraphs.push(source.slice(cursor));

  const result: string[] = [];
  let buffer = "";
  const emitBuffer = (): void => {
    const trimmed = buffer.trimEnd();
    if (trimmed) result.push(trimmed);
    buffer = "";
  };

  const finalIndex = paragraphs.length - 1;
  for (const [index, body] of paragraphs.entries()) {
    // A blank tail (text ended with a separator) contributes nothing.
    if (index === finalIndex && !body.trim()) break;

    // Separator that sat between the previous paragraph and this one.
    const leading = separators[index - 1] ?? "";
    // Keep at least one character of budget for the paragraph itself.
    const cappedLeading = leading.slice(0, Math.max(0, limit - 1));

    const joined = buffer ? buffer + leading + body : cappedLeading + body;
    if (joined.length <= limit) {
      buffer = joined;
      continue;
    }

    // Does not fit alongside what is buffered: emit that first.
    if (buffer) emitBuffer();

    const standalone = leading + body;
    if (standalone.length <= limit) {
      buffer = standalone;
      continue;
    }

    // Oversized paragraph: hand it to the length-based splitter.
    const budget = Math.max(1, limit - cappedLeading.length);
    const pieces = chunkText(body, budget);
    const head = pieces[0];
    if (head) result.push(cappedLeading + head);
    result.push(...pieces.slice(1));
  }

  if (buffer.trim()) emitBuffer();
  return result;
 }
 /**
 * Unified chunking function that dispatches based on mode.
 */
 export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
-    const chunks: string[] = [];
+    return chunkByParagraph(text, limit);
    const lineChunks = chunkByNewline(text, limit, { splitLongLines: false });
    for (const line of lineChunks) {
      const nested = chunkText(line, limit);
      if (!nested.length && line) {
        chunks.push(line);
        continue;
      }
      chunks.push(...nested);
    }
    return chunks;
  }
  return chunkText(text, limit);
 }
 export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
-    const spans = parseFenceSpans(text);
+    // Paragraph chunking is fence-safe because we never split at arbitrary indices.
-    const chunks: string[] = [];
+    // If a paragraph must be split by length, defer to the markdown-aware chunker.
-    const lineChunks = chunkByNewline(text, limit, {
+    const paragraphChunks = chunkByParagraph(text, limit);
-      splitLongLines: false,
+    const out: string[] = [];
-      trimLines: false,
+    for (const chunk of paragraphChunks) {
-      isSafeBreak: (index) => isSafeFenceBreak(spans, index),
+      const nested = chunkMarkdownText(chunk, limit);
-    });
+      if (!nested.length && chunk) out.push(chunk);
-    for (const line of lineChunks) {
+      else out.push(...nested);
      const nested = chunkMarkdownText(line, limit);
      if (!nested.length && line) {
        chunks.push(line);
        continue;
      }
      chunks.push(...nested);
    }
-    return chunks;
+    return out;
  }
  return chunkMarkdownText(text, limit);
 }
--- a/src/infra/outbound/deliver.ts
+++ b/src/infra/outbound/deliver.ts
@@ -1,5 +1,5 @@
 import {
-  chunkByNewline,
+  chunkByParagraph,
  chunkMarkdownTextWithMode,
  resolveChunkMode,
  resolveTextChunkLimit,
@@ -239,14 +239,15 @@ export async function deliverOutboundPayloads(params: {
    }
    if (chunkMode === "newline") {
      const mode = handler.chunkerMode ?? "text";
-      const lineChunks =
+      const blockChunks =
        mode === "markdown"
          ? chunkMarkdownTextWithMode(text, textLimit, "newline")
-          : chunkByNewline(text, textLimit, { splitLongLines: false });
+          : chunkByParagraph(text, textLimit);
-      if (!lineChunks.length && text) lineChunks.push(text);
+
-      for (const lineChunk of lineChunks) {
+      if (!blockChunks.length && text) blockChunks.push(text);
-        const chunks = handler.chunker(lineChunk, textLimit);
+      for (const blockChunk of blockChunks) {
-        if (!chunks.length && lineChunk) chunks.push(lineChunk);
+        const chunks = handler.chunker(blockChunk, textLimit);
        if (!chunks.length && blockChunk) chunks.push(blockChunk);
        for (const chunk of chunks) {
          throwIfAborted(abortSignal);
          results.push(await handler.sendText(chunk));