From 0130ecd8004bfe640446d0afd78d4cfe05942bf0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 25 Jan 2026 13:24:00 +0000 Subject: [PATCH] fix: paragraph-aware newline chunking (#1726) Thanks @tyler6204 Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com> --- CHANGELOG.md | 1 + docs/channels/bluebubbles.md | 2 +- docs/channels/discord.md | 4 ++-- docs/channels/imessage.md | 4 ++-- docs/channels/matrix.md | 2 +- docs/channels/msteams.md | 2 +- docs/channels/nextcloud-talk.md | 2 +- docs/channels/signal.md | 4 ++-- docs/channels/slack.md | 2 +- docs/channels/telegram.md | 4 ++-- docs/channels/whatsapp.md | 2 +- docs/concepts/streaming.md | 2 +- docs/gateway/configuration.md | 2 +- src/auto-reply/chunk.test.ts | 5 +++++ src/auto-reply/chunk.ts | 16 +++++++++++++--- src/discord/chunk.test.ts | 2 +- src/infra/outbound/deliver.test.ts | 7 +++---- 17 files changed, 39 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebff8d858..2bc3ebae3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ Docs: https://docs.clawd.bot - Media understanding: skip image understanding when the primary model already supports vision. (#1747) Thanks @tyler6204. - Models: default missing custom provider fields so minimal configs are accepted. - Messaging: keep newline chunking safe for fenced markdown blocks across channels. +- Messaging: treat newline chunking as paragraph-aware (blank-line splits) to keep lists and headings together. (#1726) Thanks @tyler6204. - TUI: reload history after gateway reconnect to restore session state. (#1663) - Heartbeat: normalize target identifiers for consistent routing. - Exec: keep approvals for elevated ask unless full mode. (#1616) Thanks @ivancasco. diff --git a/docs/channels/bluebubbles.md b/docs/channels/bluebubbles.md index 1dd8e560d..a1f4a0892 100644 --- a/docs/channels/bluebubbles.md +++ b/docs/channels/bluebubbles.md @@ -196,7 +196,7 @@ Provider options: - `channels.bluebubbles.sendReadReceipts`: Send read receipts (default: `true`). - `channels.bluebubbles.blockStreaming`: Enable block streaming (default: `true`). - `channels.bluebubbles.textChunkLimit`: Outbound chunk size in chars (default: 4000). -- `channels.bluebubbles.chunkMode`: `length` (default) splits only when exceeding `textChunkLimit`; `newline` splits on every newline and sends each line immediately during streaming. +- `channels.bluebubbles.chunkMode`: `length` (default) splits only when exceeding `textChunkLimit`; `newline` splits on blank lines (paragraph boundaries) before length chunking. - `channels.bluebubbles.mediaMaxMb`: Inbound media cap in MB (default: 8). - `channels.bluebubbles.historyLimit`: Max group messages for context (0 disables). - `channels.bluebubbles.dmHistoryLimit`: DM history limit. diff --git a/docs/channels/discord.md b/docs/channels/discord.md index f63fd45c9..12dd28084 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -205,7 +205,7 @@ Notes: ## Capabilities & limits - DMs and guild text channels (threads are treated as separate channels; voice not supported). - Typing indicators sent best-effort; message chunking uses `channels.discord.textChunkLimit` (default 2000) and splits tall replies by line count (`channels.discord.maxLinesPerMessage`, default 17). -- Optional newline chunking: set `channels.discord.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.discord.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - File uploads supported up to the configured `channels.discord.mediaMaxMb` (default 8 MB). - Mention-gated guild replies by default to avoid noisy bots. - Reply context is injected when a message references another message (quoted content + ids). @@ -307,7 +307,7 @@ ack reaction after the bot replies. - `guilds..requireMention`: per-guild mention requirement (overridable per channel). - `guilds..reactionNotifications`: reaction system event mode (`off`, `own`, `all`, `allowlist`). - `textChunkLimit`: outbound text chunk size (chars). Default: 2000. -- `chunkMode`: `length` (default) splits only when exceeding `textChunkLimit`; `newline` splits on every newline before length chunking. +- `chunkMode`: `length` (default) splits only when exceeding `textChunkLimit`; `newline` splits on blank lines (paragraph boundaries) before length chunking. - `maxLinesPerMessage`: soft max line count per message. Default: 17. - `mediaMaxMb`: clamp inbound media saved to disk. - `historyLimit`: number of recent guild messages to include as context when replying to a mention (default 20; falls back to `messages.groupChat.historyLimit`; `0` disables). diff --git a/docs/channels/imessage.md b/docs/channels/imessage.md index 316822dc5..bae945e8c 100644 --- a/docs/channels/imessage.md +++ b/docs/channels/imessage.md @@ -219,7 +219,7 @@ This is useful when you want an isolated personality/model for a specific thread ## Limits - Outbound text is chunked to `channels.imessage.textChunkLimit` (default 4000). -- Optional newline chunking: set `channels.imessage.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.imessage.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - Media uploads are capped by `channels.imessage.mediaMaxMb` (default 16). ## Addressing / delivery targets @@ -254,7 +254,7 @@ Provider options: - `channels.imessage.includeAttachments`: ingest attachments into context. - `channels.imessage.mediaMaxMb`: inbound/outbound media cap (MB). - `channels.imessage.textChunkLimit`: outbound chunk size (chars). -- `channels.imessage.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.imessage.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. Related global options: - `agents.list[].groupChat.mentionPatterns` (or `messages.groupChat.mentionPatterns`). diff --git a/docs/channels/matrix.md b/docs/channels/matrix.md index 77a2989d5..2d9025f51 100644 --- a/docs/channels/matrix.md +++ b/docs/channels/matrix.md @@ -215,7 +215,7 @@ Provider options: - `channels.matrix.initialSyncLimit`: initial sync limit. - `channels.matrix.threadReplies`: `off | inbound | always` (default: inbound). - `channels.matrix.textChunkLimit`: outbound text chunk size (chars). -- `channels.matrix.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.matrix.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. - `channels.matrix.dm.policy`: `pairing | allowlist | open | disabled` (default: pairing). - `channels.matrix.dm.allowFrom`: DM allowlist (user IDs or display names). `open` requires `"*"`. The wizard resolves names to IDs when possible. - `channels.matrix.groupPolicy`: `allowlist | open | disabled` (default: allowlist). diff --git a/docs/channels/msteams.md b/docs/channels/msteams.md index de3b064b2..2f6ed5f83 100644 --- a/docs/channels/msteams.md +++ b/docs/channels/msteams.md @@ -415,7 +415,7 @@ Key settings (see `/gateway/configuration` for shared channel patterns): - `channels.msteams.dmPolicy`: `pairing | allowlist | open | disabled` (default: pairing) - `channels.msteams.allowFrom`: allowlist for DMs (AAD object IDs, UPNs, or display names). The wizard resolves names to IDs during setup when Graph access is available. - `channels.msteams.textChunkLimit`: outbound text chunk size. -- `channels.msteams.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.msteams.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. - `channels.msteams.mediaAllowHosts`: allowlist for inbound attachment hosts (defaults to Microsoft/Teams domains). - `channels.msteams.requireMention`: require @mention in channels/groups (default true). - `channels.msteams.replyStyle`: `thread | top-level` (see [Reply Style](#reply-style-threads-vs-posts)). diff --git a/docs/channels/nextcloud-talk.md b/docs/channels/nextcloud-talk.md index 43c1595ed..abc696444 100644 --- a/docs/channels/nextcloud-talk.md +++ b/docs/channels/nextcloud-talk.md @@ -114,7 +114,7 @@ Provider options: - `channels.nextcloud-talk.dmHistoryLimit`: DM history limit (0 disables). - `channels.nextcloud-talk.dms`: per-DM overrides (historyLimit). - `channels.nextcloud-talk.textChunkLimit`: outbound text chunk size (chars). -- `channels.nextcloud-talk.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.nextcloud-talk.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. - `channels.nextcloud-talk.blockStreaming`: disable block streaming for this channel. - `channels.nextcloud-talk.blockStreamingCoalesce`: block streaming coalesce tuning. - `channels.nextcloud-talk.mediaMaxMb`: inbound media cap (MB). diff --git a/docs/channels/signal.md b/docs/channels/signal.md index 0ba89385d..c154b0591 100644 --- a/docs/channels/signal.md +++ b/docs/channels/signal.md @@ -111,7 +111,7 @@ Groups: ## Media + limits - Outbound text is chunked to `channels.signal.textChunkLimit` (default 4000). -- Optional newline chunking: set `channels.signal.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.signal.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - Attachments supported (base64 fetched from `signal-cli`). - Default media cap: `channels.signal.mediaMaxMb` (default 8). - Use `channels.signal.ignoreAttachments` to skip downloading media. @@ -170,7 +170,7 @@ Provider options: - `channels.signal.historyLimit`: max group messages to include as context (0 disables). - `channels.signal.dmHistoryLimit`: DM history limit in user turns. Per-user overrides: `channels.signal.dms[""].historyLimit`. - `channels.signal.textChunkLimit`: outbound chunk size (chars). -- `channels.signal.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.signal.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. - `channels.signal.mediaMaxMb`: inbound/outbound media cap (MB). Related global options: diff --git a/docs/channels/slack.md b/docs/channels/slack.md index 44bc84035..5f768db0e 100644 --- a/docs/channels/slack.md +++ b/docs/channels/slack.md @@ -349,7 +349,7 @@ ack reaction after the bot replies. ## Limits - Outbound text is chunked to `channels.slack.textChunkLimit` (default 4000). -- Optional newline chunking: set `channels.slack.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.slack.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - Media uploads are capped by `channels.slack.mediaMaxMb` (default 20). ## Reply threading diff --git a/docs/channels/telegram.md b/docs/channels/telegram.md index eb558cf74..e708e2e64 100644 --- a/docs/channels/telegram.md +++ b/docs/channels/telegram.md @@ -135,7 +135,7 @@ Notes: ## Limits - Outbound text is chunked to `channels.telegram.textChunkLimit` (default 4000). -- Optional newline chunking: set `channels.telegram.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.telegram.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - Media downloads/uploads are capped by `channels.telegram.mediaMaxMb` (default 5). - Telegram Bot API requests time out after `channels.telegram.timeoutSeconds` (default 500 via grammY). Set lower to avoid long hangs. - Group history context uses `channels.telegram.historyLimit` (or `channels.telegram.accounts.*.historyLimit`), falling back to `messages.groupChat.historyLimit`. Set `0` to disable (default 50). @@ -524,7 +524,7 @@ Provider options: - `channels.telegram.accounts..capabilities.inlineButtons`: per-account override. - `channels.telegram.replyToMode`: `off | first | all` (default: `first`). - `channels.telegram.textChunkLimit`: outbound chunk size (chars). -- `channels.telegram.chunkMode`: `length` (default) or `newline` to split on newlines before length chunking. +- `channels.telegram.chunkMode`: `length` (default) or `newline` to split on blank lines (paragraph boundaries) before length chunking. - `channels.telegram.linkPreview`: toggle link previews for outbound messages (default: true). - `channels.telegram.streamMode`: `off | partial | block` (draft streaming). - `channels.telegram.mediaMaxMb`: inbound/outbound media cap (MB). diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 517c71b93..4759cf4c9 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -271,7 +271,7 @@ WhatsApp can automatically send emoji reactions to incoming messages immediately ## Limits - Outbound text is chunked to `channels.whatsapp.textChunkLimit` (default 4000). -- Optional newline chunking: set `channels.whatsapp.chunkMode="newline"` to split on each line before length chunking. +- Optional newline chunking: set `channels.whatsapp.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. - Inbound media saves are capped by `channels.whatsapp.mediaMaxMb` (default 50 MB). - Outbound media items are capped by `agents.defaults.mediaMaxMb` (default 5 MB). diff --git a/docs/concepts/streaming.md b/docs/concepts/streaming.md index 8019e4cca..6f9609ca6 100644 --- a/docs/concepts/streaming.md +++ b/docs/concepts/streaming.md @@ -38,7 +38,7 @@ Legend: - `agents.defaults.blockStreamingChunk`: `{ minChars, maxChars, breakPreference? }`. - `agents.defaults.blockStreamingCoalesce`: `{ minChars?, maxChars?, idleMs? }` (merge streamed blocks before send). - Channel hard cap: `*.textChunkLimit` (e.g., `channels.whatsapp.textChunkLimit`). -- Channel chunk mode: `*.chunkMode` (`length` default, `newline` splits on each line before length chunking). +- Channel chunk mode: `*.chunkMode` (`length` default, `newline` splits on blank lines (paragraph boundaries) before length chunking). - Discord soft cap: `channels.discord.maxLinesPerMessage` (default 17) splits tall replies to avoid UI clipping. **Boundary semantics:** diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 3b16be5b1..868126101 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1131,7 +1131,7 @@ Reaction notification modes: - `own`: reactions on the bot's own messages (default). - `all`: all reactions on all messages. - `allowlist`: reactions from `guilds..users` on all messages (empty list disables). -Outbound text is chunked by `channels.discord.textChunkLimit` (default 2000). Set `channels.discord.chunkMode="newline"` to split on line boundaries before length chunking. Discord clients can clip very tall messages, so `channels.discord.maxLinesPerMessage` (default 17) splits long multi-line replies even when under 2000 chars. +Outbound text is chunked by `channels.discord.textChunkLimit` (default 2000). Set `channels.discord.chunkMode="newline"` to split on blank lines (paragraph boundaries) before length chunking. Discord clients can clip very tall messages, so `channels.discord.maxLinesPerMessage` (default 17) splits long multi-line replies even when under 2000 chars. Retry policy defaults and behavior are documented in [Retry policy](/concepts/retry). ### `channels.googlechat` (Chat API webhook) diff --git a/src/auto-reply/chunk.test.ts b/src/auto-reply/chunk.test.ts index 7007e0abc..545899843 100644 --- a/src/auto-reply/chunk.test.ts +++ b/src/auto-reply/chunk.test.ts @@ -344,6 +344,11 @@ describe("chunkMarkdownTextWithMode", () => { expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]); }); + it("defers long markdown paragraphs to markdown chunking in newline mode", () => { + const text = `\`\`\`js\n${"const a = 1;\n".repeat(20)}\`\`\``; + expect(chunkMarkdownTextWithMode(text, 40, "newline")).toEqual(chunkMarkdownText(text, 40)); + }); + it("does not split on blank lines inside a fenced code block", () => { const text = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```"; expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]); diff --git a/src/auto-reply/chunk.ts b/src/auto-reply/chunk.ts index 1615699b9..c4fd31ed8 100644 --- a/src/auto-reply/chunk.ts +++ b/src/auto-reply/chunk.ts @@ -173,10 +173,16 @@ export function chunkByNewline( * - Only breaks at paragraph separators ("\n\n" or more, allowing whitespace on blank lines) * - Packs multiple paragraphs into a single chunk up to `limit` * - Falls back to length-based splitting when a single paragraph exceeds `limit` + * (unless `splitLongParagraphs` is disabled) */ -export function chunkByParagraph(text: string, limit: number): string[] { +export function chunkByParagraph( + text: string, + limit: number, + opts?: { splitLongParagraphs?: boolean }, +): string[] { if (!text) return []; if (limit <= 0) return [text]; + const splitLongParagraphs = opts?.splitLongParagraphs !== false; // Normalize to \n so blank line detection is consistent. const normalized = text.replace(/\r\n?/g, "\n"); @@ -186,7 +192,9 @@ export function chunkByParagraph(text: string, limit: number): string[] { // boundaries, not only exceeding a length limit.) const paragraphRe = /\n[\t ]*\n+/; if (!paragraphRe.test(normalized)) { - return normalized.length <= limit ? [normalized] : chunkText(normalized, limit); + if (normalized.length <= limit) return [normalized]; + if (!splitLongParagraphs) return [normalized]; + return chunkText(normalized, limit); } const spans = parseFenceSpans(normalized); @@ -213,6 +221,8 @@ export function chunkByParagraph(text: string, limit: number): string[] { if (!paragraph.trim()) continue; if (paragraph.length <= limit) { chunks.push(paragraph); + } else if (!splitLongParagraphs) { + chunks.push(paragraph); } else { chunks.push(...chunkText(paragraph, limit)); } @@ -235,7 +245,7 @@ export function chunkMarkdownTextWithMode(text: string, limit: number, mode: Chu if (mode === "newline") { // Paragraph chunking is fence-safe because we never split at arbitrary indices. // If a paragraph must be split by length, defer to the markdown-aware chunker. - const paragraphChunks = chunkByParagraph(text, limit); + const paragraphChunks = chunkByParagraph(text, limit, { splitLongParagraphs: false }); const out: string[] = []; for (const chunk of paragraphChunks) { const nested = chunkMarkdownText(chunk, limit); diff --git a/src/discord/chunk.test.ts b/src/discord/chunk.test.ts index f8e18e2b4..13ec1b8e7 100644 --- a/src/discord/chunk.test.ts +++ b/src/discord/chunk.test.ts @@ -58,7 +58,7 @@ describe("chunkDiscordText", () => { maxLines: 50, chunkMode: "newline", }); - expect(chunks).toEqual(["```js\nconst a = 1;\nconst b = 2;\n```", "After"]); + expect(chunks).toEqual([text]); }); it("reserves space for closing fences when chunking", () => { diff --git a/src/infra/outbound/deliver.test.ts b/src/infra/outbound/deliver.test.ts index d259366b4..a80a3f482 100644 --- a/src/infra/outbound/deliver.test.ts +++ b/src/infra/outbound/deliver.test.ts @@ -192,7 +192,7 @@ describe("deliverOutboundPayloads", () => { expect(sendWhatsApp).toHaveBeenNthCalledWith( 2, "+1555", - "\nLine two", + "Line two", expect.objectContaining({ verbose: false }), ); }); @@ -241,9 +241,8 @@ describe("deliverOutboundPayloads", () => { payloads: [{ text }], }); - expect(chunker).toHaveBeenCalledTimes(2); - expect(chunker).toHaveBeenNthCalledWith(1, "```js\nconst a = 1;\nconst b = 2;\n```", 4000); - expect(chunker).toHaveBeenNthCalledWith(2, "After", 4000); + expect(chunker).toHaveBeenCalledTimes(1); + expect(chunker).toHaveBeenNthCalledWith(1, text, 4000); }); it("uses iMessage media maxBytes from agent fallback", async () => {