fix: preserve markdown fences when chunking

This commit is contained in:
Peter Steinberger
2026-01-06 20:19:56 +01:00
parent afc42c7547
commit 67bda21811
13 changed files with 378 additions and 108 deletions

View File

@@ -101,6 +101,7 @@
- Telegram: notify users when inbound media exceeds size limits. Thanks @jarvis-medmatic for PR #283.
- Telegram: send GIF media as animations (auto-play) and improve filename sniffing.
- Bash tool: inherit gateway PATH so Nix-provided tools resolve during commands. Thanks @joshp123 for PR #202.
- Delivery chunking: keep Markdown fenced code blocks valid when splitting long replies (close + reopen fences).
### Maintenance
- Agent: add `skipBootstrap` config option. Thanks @onutc for PR #292.

View File

@@ -1,17 +1,15 @@
import {
findFenceSpanAt,
isSafeFenceBreak,
parseFenceSpans,
} from "../markdown/fences.js";
/**
 * Size and boundary settings for block-reply chunking.
 * NOTE(review): the consumer is only partially visible here; the field notes
 * below are inferred from the `minChars` / `maxChars` / `preference` checks in
 * EmbeddedBlockChunker and should be confirmed against the full class.
 */
export type BlockReplyChunking = {
// Presumably the smallest index at which a break may be taken (checks of the form `idx >= minChars`).
minChars: number;
// Presumably the buffer length at which a split is forced (check `buffer.length >= maxChars`).
maxChars: number;
// Optional boundary kind to try; the visible code handles "paragraph" ("\n\n") and "newline" ("\n") explicitly.
breakPreference?: "paragraph" | "newline" | "sentence";
};
/** One fenced code block located by `parseFenceSpans`, described by buffer offsets. */
type FenceSpan = {
// Offset of the first character of the opening fence line.
start: number;
// Offset just past the closing fence line (including its "\n" when present), or the buffer length when the fence is never closed.
end: number;
// Full text of the opening fence line (indent, marker and anything after it), without the line break.
openLine: string;
// The run of three or more backticks or tildes that opened the fence.
marker: string;
// The zero to three leading spaces in front of the opening marker.
indent: string;
};
type FenceSplit = {
closeFenceLine: string;
reopenFenceLine: string;
@@ -123,7 +121,10 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph") {
let paragraphIdx = buffer.indexOf("\n\n");
while (paragraphIdx !== -1) {
if (paragraphIdx >= minChars && isSafeBreak(fenceSpans, paragraphIdx)) {
if (
paragraphIdx >= minChars &&
isSafeFenceBreak(fenceSpans, paragraphIdx)
) {
return { index: paragraphIdx };
}
paragraphIdx = buffer.indexOf("\n\n", paragraphIdx + 2);
@@ -133,7 +134,10 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = buffer.indexOf("\n");
while (newlineIdx !== -1) {
if (newlineIdx >= minChars && isSafeBreak(fenceSpans, newlineIdx)) {
if (
newlineIdx >= minChars &&
isSafeFenceBreak(fenceSpans, newlineIdx)
) {
return { index: newlineIdx };
}
newlineIdx = buffer.indexOf("\n", newlineIdx + 1);
@@ -147,7 +151,7 @@ export class EmbeddedBlockChunker {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
@@ -168,7 +172,7 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph") {
let paragraphIdx = window.lastIndexOf("\n\n");
while (paragraphIdx >= minChars) {
if (isSafeBreak(fenceSpans, paragraphIdx)) {
if (isSafeFenceBreak(fenceSpans, paragraphIdx)) {
return { index: paragraphIdx };
}
paragraphIdx = window.lastIndexOf("\n\n", paragraphIdx - 1);
@@ -178,7 +182,7 @@ export class EmbeddedBlockChunker {
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = window.lastIndexOf("\n");
while (newlineIdx >= minChars) {
if (isSafeBreak(fenceSpans, newlineIdx)) {
if (isSafeFenceBreak(fenceSpans, newlineIdx)) {
return { index: newlineIdx };
}
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
@@ -192,7 +196,7 @@ export class EmbeddedBlockChunker {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
@@ -200,13 +204,13 @@ export class EmbeddedBlockChunker {
}
for (let i = window.length - 1; i >= minChars; i--) {
if (/\s/.test(window[i]) && isSafeBreak(fenceSpans, i)) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
return { index: i };
}
}
if (buffer.length >= maxChars) {
if (isSafeBreak(fenceSpans, maxChars)) return { index: maxChars };
if (isSafeFenceBreak(fenceSpans, maxChars)) return { index: maxChars };
const fence = findFenceSpanAt(fenceSpans, maxChars);
if (fence) {
return {
@@ -229,76 +233,3 @@ function stripLeadingNewlines(value: string): string {
while (i < value.length && value[i] === "\n") i++;
return i > 0 ? value.slice(i) : value;
}
/**
 * Scan `buffer` line by line and return one span per fenced code block.
 *
 * A fence line is zero to three spaces of indent followed by three or more
 * backticks or tildes. An open fence is closed by the next fence line that
 * uses the same marker character and is at least as long as the opener. A
 * fence that is still open at the end of the buffer yields a span ending at
 * `buffer.length`.
 *
 * Lines are split on "\n". A single trailing "\r" (CRLF input) is ignored when
 * testing whether a line is a fence, because the pattern's `.` never matches a
 * carriage return and such lines would otherwise never be recognised.
 *
 * @param buffer Text to scan.
 * @returns Spans in document order; empty when no fence line is found.
 */
function parseFenceSpans(buffer: string): FenceSpan[] {
  const spans: FenceSpan[] = [];
  let open:
    | {
        start: number;
        markerChar: string;
        markerLen: number;
        openLine: string;
        marker: string;
        indent: string;
      }
    | undefined;
  let offset = 0;
  while (offset <= buffer.length) {
    const nextNewline = buffer.indexOf("\n", offset);
    const lineEnd = nextNewline === -1 ? buffer.length : nextNewline;
    const line = buffer.slice(offset, lineEnd);
    // Match against the line without its CR; offsets and `openLine` keep using
    // the raw line so `start + openLine.length` still points at the "\n".
    const probe = line.endsWith("\r") ? line.slice(0, -1) : line;
    const match = probe.match(/^( {0,3})(`{3,}|~{3,})(.*)$/);
    if (match) {
      const indent = match[1];
      const marker = match[2];
      const markerChar = marker[0];
      const markerLen = marker.length;
      if (!open) {
        open = {
          start: offset,
          markerChar,
          markerLen,
          openLine: line,
          marker,
          indent,
        };
      } else if (
        open.markerChar === markerChar &&
        markerLen >= open.markerLen
      ) {
        // The span includes the closing line's newline when there is one.
        const end = nextNewline === -1 ? buffer.length : nextNewline + 1;
        spans.push({
          start: open.start,
          end,
          openLine: open.openLine,
          marker: open.marker,
          indent: open.indent,
        });
        open = undefined;
      }
    }
    if (nextNewline === -1) break;
    offset = nextNewline + 1;
  }
  if (open) {
    // Unterminated fence: treat everything up to the end as fenced.
    spans.push({
      start: open.start,
      end: buffer.length,
      openLine: open.openLine,
      marker: open.marker,
      indent: open.indent,
    });
  }
  return spans;
}
/**
 * Return the first span that strictly contains `index`, i.e. the index lies
 * after the span's start and before its end. Positions exactly on either
 * boundary are treated as outside the fence.
 */
function findFenceSpanAt(
  spans: FenceSpan[],
  index: number,
): FenceSpan | undefined {
  for (const candidate of spans) {
    const afterStart = index > candidate.start;
    const beforeEnd = index < candidate.end;
    if (afterStart && beforeEnd) return candidate;
  }
  return undefined;
}
/** True when `index` is not strictly inside any of the given fence spans. */
function isSafeBreak(spans: FenceSpan[], index: number): boolean {
  const enclosing = findFenceSpanAt(spans, index);
  return enclosing === undefined;
}

View File

@@ -1,6 +1,29 @@
import { describe, expect, it } from "vitest";
import { chunkText, resolveTextChunkLimit } from "./chunk.js";
import {
chunkMarkdownText,
chunkText,
resolveTextChunkLimit,
} from "./chunk.js";
/**
 * Assert that every chunk, taken on its own, closes each fence it opens.
 * A fence line closes the pending fence only when it uses the same marker
 * character and is at least as long as the opener.
 */
function expectFencesBalanced(chunks: string[]) {
  const fencePattern = /^( {0,3})(`{3,}|~{3,})(.*)$/;
  for (const piece of chunks) {
    let pending: { markerChar: string; markerLen: number } | null = null;
    for (const row of piece.split("\n")) {
      const hit = fencePattern.exec(row);
      if (hit === null) continue;
      const fence = hit[2];
      if (pending === null) {
        pending = { markerChar: fence[0], markerLen: fence.length };
      } else if (
        pending.markerChar === fence[0] &&
        fence.length >= pending.markerLen
      ) {
        pending = null;
      }
    }
    // Anything still pending means the chunk ended inside an open fence.
    expect(pending).toBe(null);
  }
}
describe("chunkText", () => {
it("keeps multi-line text in one chunk when under limit", () => {
@@ -72,3 +95,79 @@ describe("resolveTextChunkLimit", () => {
expect(resolveTextChunkLimit(cfg, "telegram")).toBe(4000);
});
});
// Behavioural tests for chunkMarkdownText: every produced chunk must contain
// balanced fences, whether or not the split had to happen inside a code block.
describe("chunkMarkdownText", () => {
// The limit (40) is smaller than the 60-char prefix and suffix but larger than
// the 23-char fence, so the fence is expected to come out as a chunk of its own.
it("keeps fenced blocks intact when a safe break exists", () => {
const prefix = "p".repeat(60);
const fence = "```bash\nline1\nline2\n```";
const suffix = "s".repeat(60);
const text = `${prefix}\n\n${fence}\n\n${suffix}`;
const chunks = chunkMarkdownText(text, 40);
expect(chunks.some((chunk) => chunk.trimEnd() === fence)).toBe(true);
expectFencesBalanced(chunks);
});
// The fenced body is one 500-char line, so there is no break outside the fence:
// each chunk must start with the original opening line and end with a closing marker.
it("reopens fenced blocks when forced to split inside them", () => {
const text = `\`\`\`txt\n${"a".repeat(500)}\n\`\`\``;
const limit = 120;
const chunks = chunkMarkdownText(text, limit);
expect(chunks.length).toBeGreaterThan(1);
for (const chunk of chunks) {
expect(chunk.length).toBeLessThanOrEqual(limit);
expect(chunk.startsWith("```txt\n")).toBe(true);
expect(chunk.trimEnd().endsWith("```")).toBe(true);
}
expectFencesBalanced(chunks);
});
// Same forced-split scenario, using "~~~" as the fence marker.
it("supports tilde fences", () => {
const text = `~~~sh\n${"x".repeat(600)}\n~~~`;
const limit = 140;
const chunks = chunkMarkdownText(text, limit);
expect(chunks.length).toBeGreaterThan(1);
for (const chunk of chunks) {
expect(chunk.length).toBeLessThanOrEqual(limit);
expect(chunk.startsWith("~~~sh\n")).toBe(true);
expect(chunk.trimEnd().endsWith("~~~")).toBe(true);
}
expectFencesBalanced(chunks);
});
// A four-backtick opener must be closed and reopened with four backticks, not three.
it("supports longer fence markers for close", () => {
const text = `\`\`\`\`md\n${"y".repeat(600)}\n\`\`\`\``;
const limit = 140;
const chunks = chunkMarkdownText(text, limit);
expect(chunks.length).toBeGreaterThan(1);
for (const chunk of chunks) {
expect(chunk.length).toBeLessThanOrEqual(limit);
expect(chunk.startsWith("````md\n")).toBe(true);
expect(chunk.trimEnd().endsWith("````")).toBe(true);
}
expectFencesBalanced(chunks);
});
// The opener's leading indent must be carried over to both the synthetic close and the reopen line.
it("preserves indentation for indented fences", () => {
const text = ` \`\`\`js\n ${"z".repeat(600)}\n \`\`\``;
const limit = 160;
const chunks = chunkMarkdownText(text, limit);
expect(chunks.length).toBeGreaterThan(1);
for (const chunk of chunks) {
expect(chunk.length).toBeLessThanOrEqual(limit);
expect(chunk.startsWith(" ```js\n")).toBe(true);
expect(chunk.trimEnd().endsWith(" ```")).toBe(true);
}
expectFencesBalanced(chunks);
});
// With a tight limit (60), no chunk may consist of fence lines only:
// after dropping fence lines, some non-blank content must remain.
it("never produces an empty fenced chunk when splitting", () => {
const text = `\`\`\`txt\n${"a".repeat(300)}\n\`\`\``;
const chunks = chunkMarkdownText(text, 60);
for (const chunk of chunks) {
const nonFenceLines = chunk
.split("\n")
.filter((line) => !/^( {0,3})(`{3,}|~{3,})(.*)$/.test(line));
expect(nonFenceLines.join("\n").trim()).not.toBe("");
}
});
});

View File

@@ -3,6 +3,11 @@
// the chunk so messages are only split when they truly exceed the limit.
import type { ClawdbotConfig } from "../config/config.js";
import {
findFenceSpanAt,
isSafeFenceBreak,
parseFenceSpans,
} from "../markdown/fences.js";
export type TextChunkProvider =
| "whatsapp"
@@ -91,3 +96,123 @@ export function chunkText(text: string, limit: number): string[] {
return chunks;
}
/**
 * Split Markdown `text` into delivery-sized chunks while keeping fenced code
 * blocks balanced inside every chunk.
 *
 * Each pass inspects the first `limit` characters of the remaining text and
 * prefers to break on whitespace that lies outside any fence. When the break
 * has to fall inside a fence, the chunk is terminated with a synthetic closing
 * fence line (the opener's indent + marker) and the remainder is re-prefixed
 * with the fence's original opening line.
 *
 * When `limit` leaves no room for a closing fence line, or no fenced content
 * past the opening line would fit in front of it, fence handling is abandoned
 * for that chunk and the text is hard-split at `limit`. This keeps the chunk
 * within `limit` and guarantees the loop always consumes input.
 *
 * @param text  Markdown to split. Empty input yields `[]`.
 * @param limit Target maximum chunk length, measured with `String.length`.
 *              A non-positive value disables chunking and returns `[text]`.
 * @returns The chunks, in document order.
 */
export function chunkMarkdownText(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];
  if (text.length <= limit) return [text];

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > limit) {
    // Spans are re-parsed each pass because `remaining` is rewritten (fences
    // may be re-opened at its start), which shifts every offset.
    const spans = parseFenceSpans(remaining);
    const window = remaining.slice(0, limit);

    const softBreak = pickSafeBreakIndex(window, spans);
    let breakIdx = softBreak > 0 ? softBreak : limit;

    const initialFence = isSafeFenceBreak(spans, breakIdx)
      ? undefined
      : findFenceSpanAt(spans, breakIdx);

    let fenceToSplit = initialFence;
    if (initialFence) {
      const closeLine = `${initialFence.indent}${initialFence.marker}`;
      // Largest break index that still leaves room for "\n" + closeLine.
      const maxIdxIfNeedNewline = limit - (closeLine.length + 1);

      if (maxIdxIfNeedNewline <= 0) {
        // The close line alone does not fit: give up on fence handling.
        fenceToSplit = undefined;
        breakIdx = limit;
      } else {
        // Break no earlier than one character past the opening line's newline,
        // so a fenced chunk always carries some content.
        const minProgressIdx = Math.min(
          remaining.length,
          initialFence.start + initialFence.openLine.length + 2,
        );
        // Largest break index when the chunk already ends with "\n".
        const maxIdxIfAlreadyNewline = limit - closeLine.length;

        let pickedNewline = false;
        let lastNewline = remaining.lastIndexOf(
          "\n",
          Math.max(0, maxIdxIfAlreadyNewline - 1),
        );
        // Prefer breaking right after a newline inside the same fence.
        while (lastNewline !== -1) {
          const candidateBreak = lastNewline + 1;
          if (candidateBreak < minProgressIdx) break;
          const candidateFence = findFenceSpanAt(spans, candidateBreak);
          if (candidateFence && candidateFence.start === initialFence.start) {
            breakIdx = Math.max(1, candidateBreak);
            pickedNewline = true;
            break;
          }
          lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
        }

        if (!pickedNewline) {
          if (minProgressIdx > maxIdxIfAlreadyNewline) {
            // No fenced content fits in front of the close line: hard split.
            fenceToSplit = undefined;
            breakIdx = limit;
          } else {
            breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
          }
        }
      }

      // Re-validate only while a fence split is still planned. Doing this
      // unconditionally would undo the two opt-outs above: the fence would be
      // closed and reopened anyway, producing chunks longer than `limit` and,
      // for very small limits, a `remaining` that never shrinks.
      if (fenceToSplit) {
        const fenceAtBreak = findFenceSpanAt(spans, breakIdx);
        fenceToSplit =
          fenceAtBreak && fenceAtBreak.start === initialFence.start
            ? fenceAtBreak
            : undefined;
      }
    }

    let rawChunk = remaining.slice(0, breakIdx);
    if (!rawChunk) break;

    // Drop the single whitespace character the split landed on, if any.
    const brokeOnSeparator =
      breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
    const nextStart = Math.min(
      remaining.length,
      breakIdx + (brokeOnSeparator ? 1 : 0),
    );
    let next = remaining.slice(nextStart);

    if (fenceToSplit) {
      const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
      rawChunk = rawChunk.endsWith("\n")
        ? `${rawChunk}${closeLine}`
        : `${rawChunk}\n${closeLine}`;
      next = `${fenceToSplit.openLine}\n${next}`;
    } else {
      next = stripLeadingNewlines(next);
    }

    chunks.push(rawChunk);
    remaining = next;
  }

  if (remaining.length) chunks.push(remaining);
  return chunks;
}
/**
 * Remove every "\n" at the very start of `value`. Other whitespace,
 * including "\r", is left alone.
 */
function stripLeadingNewlines(value: string): string {
  return value.replace(/^\n+/, "");
}
/**
 * Choose where to cut `window` without landing inside a fenced code block.
 *
 * The latest newline outside every fence wins; failing that, the latest
 * whitespace character of any kind outside every fence. Index 0 is never
 * returned.
 *
 * @returns The chosen index, or -1 when no safe whitespace exists.
 */
function pickSafeBreakIndex(
  window: string,
  spans: ReturnType<typeof parseFenceSpans>,
): number {
  // Pass 1: walk newlines from the end of the window towards the start.
  for (
    let nl = window.lastIndexOf("\n");
    nl > 0;
    nl = window.lastIndexOf("\n", nl - 1)
  ) {
    if (isSafeFenceBreak(spans, nl)) return nl;
  }

  // Pass 2: accept any whitespace character, again scanning backwards.
  let cursor = window.length - 1;
  while (cursor > 0) {
    if (/\s/.test(window[cursor]) && isSafeFenceBreak(spans, cursor)) {
      return cursor;
    }
    cursor -= 1;
  }
  return -1;
}

View File

@@ -22,7 +22,11 @@ import {
DEFAULT_AGENT_WORKSPACE_DIR,
ensureAgentWorkspace,
} from "../agents/workspace.js";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
chunkText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import type { MsgContext } from "../auto-reply/templating.js";
import {
normalizeThinkLevel,
@@ -667,7 +671,7 @@ export async function agentCommand(
if (deliveryProvider === "telegram" && telegramTarget) {
try {
if (media.length === 0) {
for (const chunk of chunkText(text, deliveryTextLimit)) {
for (const chunk of chunkMarkdownText(text, deliveryTextLimit)) {
await deps.sendMessageTelegram(telegramTarget, chunk, {
verbose: false,
token: telegramToken || undefined,

View File

@@ -19,7 +19,11 @@ import {
DEFAULT_AGENT_WORKSPACE_DIR,
ensureAgentWorkspace,
} from "../agents/workspace.js";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
chunkText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import {
DEFAULT_HEARTBEAT_ACK_MAX_CHARS,
stripHeartbeatToken,
@@ -439,7 +443,10 @@ export async function runCronIsolatedAgentTurn(params: {
const mediaList =
payload.mediaUrls ?? (payload.mediaUrl ? [payload.mediaUrl] : []);
if (mediaList.length === 0) {
for (const chunk of chunkText(payload.text ?? "", textLimit)) {
for (const chunk of chunkMarkdownText(
payload.text ?? "",
textLimit,
)) {
await params.deps.sendMessageTelegram(chatId, chunk, {
verbose: false,
token: telegramToken || undefined,
@@ -528,7 +535,10 @@ export async function runCronIsolatedAgentTurn(params: {
const mediaList =
payload.mediaUrls ?? (payload.mediaUrl ? [payload.mediaUrl] : []);
if (mediaList.length === 0) {
for (const chunk of chunkText(payload.text ?? "", textLimit)) {
for (const chunk of chunkMarkdownText(
payload.text ?? "",
textLimit,
)) {
await params.deps.sendMessageSlack(slackTarget, chunk);
}
} else {

View File

@@ -15,7 +15,10 @@ import {
type PartialUser,
type User,
} from "discord.js";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import { hasControlCommand } from "../auto-reply/command-detection.js";
import { formatAgentEnvelope } from "../auto-reply/envelope.js";
import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
@@ -1295,7 +1298,7 @@ async function deliverReplies({
const replyToId = payload.replyToId;
if (!text && mediaList.length === 0) continue;
if (mediaList.length === 0) {
for (const chunk of chunkText(text, chunkLimit)) {
for (const chunk of chunkMarkdownText(text, chunkLimit)) {
const replyTo = resolveDiscordReplyTarget({
replyToMode,
replyToId,

View File

@@ -12,7 +12,7 @@ import type {
RESTPostAPIGuildScheduledEventJSONBody,
} from "discord-api-types/v10";
import { chunkText } from "../auto-reply/chunk.js";
import { chunkMarkdownText } from "../auto-reply/chunk.js";
import { loadConfig } from "../config/config.js";
import {
normalizePollDurationHours,
@@ -360,7 +360,7 @@ async function sendDiscordText(
})) as { id: string; channel_id: string };
return res;
}
const chunks = chunkText(text, DISCORD_TEXT_LIMIT);
const chunks = chunkMarkdownText(text, DISCORD_TEXT_LIMIT);
let last: { id: string; channel_id: string } | null = null;
let isFirst = true;
for (const chunk of chunks) {

85
src/markdown/fences.ts Normal file
View File

@@ -0,0 +1,85 @@
/** One fenced code block located by `parseFenceSpans`, described by buffer offsets. */
export type FenceSpan = {
// Offset of the first character of the opening fence line.
start: number;
// Offset just past the closing fence line (including its "\n" when present), or the buffer length when the fence is never closed.
end: number;
// Full text of the opening fence line (indent, marker and anything after it), without the line break.
openLine: string;
// The run of three or more backticks or tildes that opened the fence.
marker: string;
// The zero to three leading spaces in front of the opening marker.
indent: string;
};
/**
 * Scan `buffer` line by line and return one span per fenced code block.
 *
 * A fence line is zero to three spaces of indent followed by three or more
 * backticks or tildes. An open fence is closed by the next fence line that
 * uses the same marker character and is at least as long as the opener. A
 * fence that is still open at the end of the buffer yields a span ending at
 * `buffer.length`.
 *
 * Lines are split on "\n". A single trailing "\r" (CRLF input) is ignored when
 * testing whether a line is a fence, because the pattern's `.` never matches a
 * carriage return and such lines would otherwise never be recognised.
 *
 * @param buffer Text to scan.
 * @returns Spans in document order; empty when no fence line is found.
 */
export function parseFenceSpans(buffer: string): FenceSpan[] {
  const spans: FenceSpan[] = [];
  let open:
    | {
        start: number;
        markerChar: string;
        markerLen: number;
        openLine: string;
        marker: string;
        indent: string;
      }
    | undefined;
  let offset = 0;
  while (offset <= buffer.length) {
    const nextNewline = buffer.indexOf("\n", offset);
    const lineEnd = nextNewline === -1 ? buffer.length : nextNewline;
    const line = buffer.slice(offset, lineEnd);
    // Match against the line without its CR; offsets and `openLine` keep using
    // the raw line so `start + openLine.length` still points at the "\n".
    const probe = line.endsWith("\r") ? line.slice(0, -1) : line;
    const match = probe.match(/^( {0,3})(`{3,}|~{3,})(.*)$/);
    if (match) {
      const indent = match[1];
      const marker = match[2];
      const markerChar = marker[0];
      const markerLen = marker.length;
      if (!open) {
        open = {
          start: offset,
          markerChar,
          markerLen,
          openLine: line,
          marker,
          indent,
        };
      } else if (
        open.markerChar === markerChar &&
        markerLen >= open.markerLen
      ) {
        // The span includes the closing line's newline when there is one.
        const end = nextNewline === -1 ? buffer.length : nextNewline + 1;
        spans.push({
          start: open.start,
          end,
          openLine: open.openLine,
          marker: open.marker,
          indent: open.indent,
        });
        open = undefined;
      }
    }
    if (nextNewline === -1) break;
    offset = nextNewline + 1;
  }
  if (open) {
    // Unterminated fence: treat everything up to the end as fenced.
    spans.push({
      start: open.start,
      end: buffer.length,
      openLine: open.openLine,
      marker: open.marker,
      indent: open.indent,
    });
  }
  return spans;
}
/**
 * Return the first span that strictly contains `index`, i.e. the index lies
 * after the span's start and before its end. Positions exactly on either
 * boundary are treated as outside the fence.
 */
export function findFenceSpanAt(
  spans: FenceSpan[],
  index: number,
): FenceSpan | undefined {
  for (const candidate of spans) {
    const afterStart = index > candidate.start;
    const beforeEnd = index < candidate.end;
    if (afterStart && beforeEnd) return candidate;
  }
  return undefined;
}
/** True when `index` is not strictly inside any of the given fence spans. */
export function isSafeFenceBreak(spans: FenceSpan[], index: number): boolean {
  const enclosing = findFenceSpanAt(spans, index);
  return enclosing === undefined;
}

View File

@@ -3,7 +3,10 @@ import {
type SlackCommandMiddlewareArgs,
type SlackEventMiddlewareArgs,
} from "@slack/bolt";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import { hasControlCommand } from "../auto-reply/command-detection.js";
import { formatAgentEnvelope } from "../auto-reply/envelope.js";
import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
@@ -1525,7 +1528,7 @@ async function deliverReplies(params: {
if (!text && mediaList.length === 0) continue;
if (mediaList.length === 0) {
for (const chunk of chunkText(text, chunkLimit)) {
for (const chunk of chunkMarkdownText(text, chunkLimit)) {
const trimmed = chunk.trim();
if (!trimmed || trimmed === SILENT_REPLY_TOKEN) continue;
await sendMessageSlack(params.target, trimmed, {
@@ -1587,7 +1590,7 @@ async function deliverSlackSlashReplies(params: {
.filter(Boolean)
.join("\n");
if (!combined) continue;
for (const chunk of chunkText(combined, chunkLimit)) {
for (const chunk of chunkMarkdownText(combined, chunkLimit)) {
messages.push(chunk);
}
}

View File

@@ -1,6 +1,9 @@
import { type FilesUploadV2Arguments, WebClient } from "@slack/web-api";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import { loadConfig } from "../config/config.js";
import { loadWebMedia } from "../web/media.js";
import { resolveSlackBotToken } from "./token.js";
@@ -144,7 +147,7 @@ export async function sendMessageSlack(
const cfg = loadConfig();
const textLimit = resolveTextChunkLimit(cfg, "slack");
const chunkLimit = Math.min(textLimit, SLACK_TEXT_LIMIT);
const chunks = chunkText(trimmedMessage, chunkLimit);
const chunks = chunkMarkdownText(trimmedMessage, chunkLimit);
const mediaMaxBytes =
typeof cfg.slack?.mediaMaxMb === "number"
? cfg.slack.mediaMaxMb * 1024 * 1024

View File

@@ -4,7 +4,10 @@ import { Buffer } from "node:buffer";
import { apiThrottler } from "@grammyjs/transformer-throttler";
import type { ApiClientOptions, Message } from "grammy";
import { Bot, InputFile, webhookCallback } from "grammy";
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import { hasControlCommand } from "../auto-reply/command-detection.js";
import { formatAgentEnvelope } from "../auto-reply/envelope.js";
import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
@@ -667,7 +670,7 @@ async function deliverReplies(params: {
? [reply.mediaUrl]
: [];
if (mediaList.length === 0) {
for (const chunk of chunkText(reply.text || "", textLimit)) {
for (const chunk of chunkMarkdownText(reply.text || "", textLimit)) {
await sendTelegramText(bot, chatId, chunk, runtime, {
replyToMessageId:
replyToId && (replyToMode === "all" || !hasReplied)

View File

@@ -1,4 +1,7 @@
import { chunkText, resolveTextChunkLimit } from "../auto-reply/chunk.js";
import {
chunkMarkdownText,
resolveTextChunkLimit,
} from "../auto-reply/chunk.js";
import { formatAgentEnvelope } from "../auto-reply/envelope.js";
import {
normalizeGroupActivation,
@@ -556,7 +559,7 @@ async function deliverWebReply(params: {
skipLog,
} = params;
const replyStarted = Date.now();
const textChunks = chunkText(replyResult.text || "", textLimit);
const textChunks = chunkMarkdownText(replyResult.text || "", textLimit);
const mediaList = replyResult.mediaUrls?.length
? replyResult.mediaUrls
: replyResult.mediaUrl