fix: paragraph-aware newline chunking (#1726)

Thanks @tyler6204

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
2026-01-25 13:24:00 +00:00
parent c3f5b4c416
commit 0130ecd800
17 changed files with 39 additions and 24 deletions
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -344,6 +344,11 @@ describe("chunkMarkdownTextWithMode", () => {
    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
  });

+  it("defers long markdown paragraphs to markdown chunking in newline mode", () => {
+    const text = `\`\`\`js\n${"const a = 1;\n".repeat(20)}\`\`\``;
+    expect(chunkMarkdownTextWithMode(text, 40, "newline")).toEqual(chunkMarkdownText(text, 40));
+  });
+
  it("does not split on blank lines inside a fenced code block", () => {
    const text = "```python\ndef my_function():\n    x = 1\n\n    y = 2\n    return x + y\n```";
    expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
--- a/src/auto-reply/chunk.ts
+++ b/src/auto-reply/chunk.ts
@@ -173,10 +173,16 @@ export function chunkByNewline(
 * - Only breaks at paragraph separators ("\n\n" or more, allowing whitespace on blank lines)
 * - Packs multiple paragraphs into a single chunk up to `limit`
 * - Falls back to length-based splitting when a single paragraph exceeds `limit`
+ *   (unless `splitLongParagraphs` is `false`; then an over-long paragraph is kept as one chunk)
 */
-export function chunkByParagraph(text: string, limit: number): string[] {
+export function chunkByParagraph(
+  text: string,
+  limit: number,
+  opts?: { splitLongParagraphs?: boolean },
+): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];
+  const splitLongParagraphs = opts?.splitLongParagraphs !== false;

  // Normalize to \n so blank line detection is consistent.
  const normalized = text.replace(/\r\n?/g, "\n");
@@ -186,7 +192,9 @@ export function chunkByParagraph(text: string, limit: number): string[] {
  // boundaries, not only exceeding a length limit.)
  const paragraphRe = /\n[\t ]*\n+/;
  if (!paragraphRe.test(normalized)) {
-    return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
+    if (normalized.length <= limit) return [normalized];
+    if (!splitLongParagraphs) return [normalized];
+    return chunkText(normalized, limit);
  }

  const spans = parseFenceSpans(normalized);
@@ -213,6 +221,8 @@ export function chunkByParagraph(text: string, limit: number): string[] {
    if (!paragraph.trim()) continue;
    if (paragraph.length <= limit) {
      chunks.push(paragraph);
+    } else if (!splitLongParagraphs) {
+      chunks.push(paragraph);
    } else {
      chunks.push(...chunkText(paragraph, limit));
    }
@@ -235,7 +245,7 @@ export function chunkMarkdownTextWithMode(text: string, limit: number, mode: Chu
  if (mode === "newline") {
    // Paragraph chunking is fence-safe because we never split at arbitrary indices.
    // If a paragraph must be split by length, defer to the markdown-aware chunker.
-    const paragraphChunks = chunkByParagraph(text, limit);
+    const paragraphChunks = chunkByParagraph(text, limit, { splitLongParagraphs: false });
    const out: string[] = [];
    for (const chunk of paragraphChunks) {
      const nested = chunkMarkdownText(chunk, limit);