fix: preserve markdown fences when chunking

This commit is contained in:
Peter Steinberger
2026-01-06 20:19:56 +01:00
parent afc42c7547
commit 67bda21811
13 changed files with 378 additions and 108 deletions

View File

@@ -1,6 +1,29 @@
import { describe, expect, it } from "vitest";
import { chunkText, resolveTextChunkLimit } from "./chunk.js";
import {
chunkMarkdownText,
chunkText,
resolveTextChunkLimit,
} from "./chunk.js";
/**
 * Test helper: asserts that every chunk, taken on its own, has balanced
 * fenced code blocks (each opening fence line is matched by a closing one).
 *
 * A fence line is up to three leading spaces followed by a run of at least
 * three backticks or tildes. A fence is treated as closed by a later fence
 * line that uses the same marker character and is at least as long as the
 * opener.
 */
function expectFencesBalanced(chunks: string[]) {
  const fencePattern = /^( {0,3})(`{3,}|~{3,})(.*)$/;
  for (const chunk of chunks) {
    // Currently open fence while scanning this chunk, or null when outside.
    let pending: { markerChar: string; markerLen: number } | null = null;
    const lines = chunk.split("\n");
    for (const line of lines) {
      const fence = fencePattern.exec(line)?.[2];
      if (fence === undefined) continue;
      if (!pending) {
        pending = { markerChar: fence[0], markerLen: fence.length };
      } else if (
        pending.markerChar === fence[0] &&
        fence.length >= pending.markerLen
      ) {
        pending = null;
      }
    }
    // Anything still open at the end of the chunk is an unbalanced fence.
    expect(pending).toBe(null);
  }
}
describe("chunkText", () => {
it("keeps multi-line text in one chunk when under limit", () => {
@@ -72,3 +95,79 @@ describe("resolveTextChunkLimit", () => {
expect(resolveTextChunkLimit(cfg, "telegram")).toBe(4000);
});
});
describe("chunkMarkdownText", () => {
  // Matches a fence line: up to three spaces, then 3+ backticks or tildes.
  const fenceLinePattern = /^( {0,3})(`{3,}|~{3,})(.*)$/;

  /**
   * Chunks `text` at `limit` and checks that the input was split, that every
   * chunk fits the limit, starts with `opener`, ends (ignoring trailing
   * whitespace) with `closer`, and has balanced fences.
   */
  const expectEveryChunkFenced = (
    text: string,
    limit: number,
    opener: string,
    closer: string,
  ) => {
    const chunks = chunkMarkdownText(text, limit);
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(limit);
      expect(chunk.startsWith(opener)).toBe(true);
      expect(chunk.trimEnd().endsWith(closer)).toBe(true);
    }
    expectFencesBalanced(chunks);
  };

  it("keeps fenced blocks intact when a safe break exists", () => {
    const fence = "```bash\nline1\nline2\n```";
    const text = `${"p".repeat(60)}\n\n${fence}\n\n${"s".repeat(60)}`;

    const chunks = chunkMarkdownText(text, 40);

    // The whole fenced block must show up as one chunk of its own.
    const hasWholeFence = chunks.some((chunk) => chunk.trimEnd() === fence);
    expect(hasWholeFence).toBe(true);
    expectFencesBalanced(chunks);
  });

  it("reopens fenced blocks when forced to split inside them", () => {
    expectEveryChunkFenced(
      `\`\`\`txt\n${"a".repeat(500)}\n\`\`\``,
      120,
      "```txt\n",
      "```",
    );
  });

  it("supports tilde fences", () => {
    expectEveryChunkFenced(
      `~~~sh\n${"x".repeat(600)}\n~~~`,
      140,
      "~~~sh\n",
      "~~~",
    );
  });

  it("supports longer fence markers for close", () => {
    expectEveryChunkFenced(
      `\`\`\`\`md\n${"y".repeat(600)}\n\`\`\`\``,
      140,
      "````md\n",
      "````",
    );
  });

  it("preserves indentation for indented fences", () => {
    expectEveryChunkFenced(
      ` \`\`\`js\n ${"z".repeat(600)}\n \`\`\``,
      160,
      " ```js\n",
      " ```",
    );
  });

  it("never produces an empty fenced chunk when splitting", () => {
    const text = `\`\`\`txt\n${"a".repeat(300)}\n\`\`\``;
    const chunks = chunkMarkdownText(text, 60);
    for (const chunk of chunks) {
      // Drop the fence lines; what is left must contain real content.
      const body = chunk
        .split("\n")
        .filter((line) => !fenceLinePattern.test(line))
        .join("\n");
      expect(body.trim()).not.toBe("");
    }
  });
});

View File

@@ -3,6 +3,11 @@
// the chunk so messages are only split when they truly exceed the limit.
import type { ClawdbotConfig } from "../config/config.js";
import {
findFenceSpanAt,
isSafeFenceBreak,
parseFenceSpans,
} from "../markdown/fences.js";
export type TextChunkProvider =
| "whatsapp"
@@ -91,3 +96,123 @@ export function chunkText(text: string, limit: number): string[] {
return chunks;
}
/**
 * Splits markdown `text` into chunks of at most `limit` characters, trying
 * not to leave a fenced code block open at a chunk boundary.
 *
 * Strategy per iteration:
 * 1. Prefer a break on whitespace that `isSafeFenceBreak` accepts.
 * 2. Otherwise, if the break falls inside a fence, move it back to a line
 *    boundary inside that fence (when one leaves room for the closing fence
 *    line), close the fence at the end of the chunk and reopen it with the
 *    original opener line at the start of the next chunk.
 * 3. If `limit` is too small to fit the closing fence line while still making
 *    progress past the opener, give up on fence preservation for this chunk
 *    and hard-break at `limit`.
 *
 * @param text  Markdown source to split.
 * @param limit Maximum chunk length in characters; `<= 0` disables chunking.
 * @returns The chunks in order; `[]` for empty input, `[text]` when no split
 *          is needed.
 */
export function chunkMarkdownText(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];
  if (text.length <= limit) return [text];

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > limit) {
    // Spans are re-parsed every round because `remaining` is rewritten (a
    // reopened fence line may be prepended) between iterations.
    const spans = parseFenceSpans(remaining);
    const window = remaining.slice(0, limit);

    const softBreak = pickSafeBreakIndex(window, spans);
    let breakIdx = softBreak > 0 ? softBreak : limit;

    const initialFence = isSafeFenceBreak(spans, breakIdx)
      ? undefined
      : findFenceSpanAt(spans, breakIdx);

    let fenceToSplit = initialFence;
    if (initialFence) {
      const closeLine = `${initialFence.indent}${initialFence.marker}`;
      // Largest break index that still leaves room for "\n" + closeLine.
      const maxIdxIfNeedNewline = limit - (closeLine.length + 1);
      // Set when the limit cannot accommodate close/reopen. This must survive
      // the fence re-check below; otherwise the chunk would exceed `limit`
      // and, for very small limits, `remaining` would never shrink.
      let abandonFence = false;

      if (maxIdxIfNeedNewline <= 0) {
        abandonFence = true;
        breakIdx = limit;
      } else {
        // Break at least one character past the opener line so the reopened
        // remainder is strictly shorter than `remaining`.
        const minProgressIdx = Math.min(
          remaining.length,
          initialFence.start + initialFence.openLine.length + 2,
        );
        // Largest break index when the chunk already ends with "\n".
        const maxIdxIfAlreadyNewline = limit - closeLine.length;

        // Walk backwards over newlines looking for a line boundary that is
        // still inside the same fence.
        let pickedNewline = false;
        let lastNewline = remaining.lastIndexOf(
          "\n",
          Math.max(0, maxIdxIfAlreadyNewline - 1),
        );
        while (lastNewline !== -1) {
          const candidateBreak = lastNewline + 1;
          if (candidateBreak < minProgressIdx) break;
          const candidateFence = findFenceSpanAt(spans, candidateBreak);
          if (candidateFence && candidateFence.start === initialFence.start) {
            breakIdx = Math.max(1, candidateBreak);
            pickedNewline = true;
            break;
          }
          lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
        }

        if (!pickedNewline) {
          if (minProgressIdx > maxIdxIfAlreadyNewline) {
            abandonFence = true;
            breakIdx = limit;
          } else {
            // No usable line boundary: split mid-line inside the fence.
            breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
          }
        }
      }

      if (abandonFence) {
        fenceToSplit = undefined;
      } else {
        // Only close/reopen if the final break is still inside the fence we
        // started from.
        const fenceAtBreak = findFenceSpanAt(spans, breakIdx);
        fenceToSplit =
          fenceAtBreak && fenceAtBreak.start === initialFence.start
            ? fenceAtBreak
            : undefined;
      }
    }

    let rawChunk = remaining.slice(0, breakIdx);
    if (!rawChunk) break;

    // charAt yields "" past the end, which matches neither test below.
    const charAtBreak = remaining.charAt(breakIdx);
    // Outside a fence the whitespace we broke on is a separator and is
    // dropped. Inside a fence whitespace is code content (indentation, blank
    // lines) and must be kept; the only character consumed is the newline
    // that terminated the chunk's last line, because it is re-emitted in
    // front of the closing fence line below.
    const skipSeparator = fenceToSplit
      ? charAtBreak === "\n" && !rawChunk.endsWith("\n")
      : /\s/.test(charAtBreak);
    const nextStart = Math.min(
      remaining.length,
      breakIdx + (skipSeparator ? 1 : 0),
    );
    let next = remaining.slice(nextStart);

    if (fenceToSplit) {
      const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
      rawChunk = rawChunk.endsWith("\n")
        ? `${rawChunk}${closeLine}`
        : `${rawChunk}\n${closeLine}`;
      next = `${fenceToSplit.openLine}\n${next}`;
    } else {
      next = stripLeadingNewlines(next);
    }

    chunks.push(rawChunk);
    remaining = next;
  }

  if (remaining.length) chunks.push(remaining);
  return chunks;
}
/**
 * Returns `value` without its leading run of "\n" characters. Other
 * whitespace (spaces, tabs, "\r") is left untouched.
 */
function stripLeadingNewlines(value: string): string {
  return value.replace(/^\n+/, "");
}
/**
 * Finds the right-most index in `window` where the text can be broken on
 * whitespace and `isSafeFenceBreak` accepts the position.
 *
 * Newlines are preferred: every newline is tried (right to left) before any
 * other whitespace character is considered. Index 0 is never returned.
 *
 * @param window Leading slice of the text that must fit in one chunk.
 * @param spans  Fence spans as returned by `parseFenceSpans`.
 * @returns The break index, or -1 when no acceptable position exists.
 */
function pickSafeBreakIndex(
  window: string,
  spans: ReturnType<typeof parseFenceSpans>,
): number {
  // Pass 1: line boundaries, scanning from the end of the window.
  for (
    let pos = window.lastIndexOf("\n");
    pos > 0;
    pos = window.lastIndexOf("\n", pos - 1)
  ) {
    if (isSafeFenceBreak(spans, pos)) return pos;
  }

  // Pass 2: any whitespace character, again from the end.
  const whitespace = /\s/;
  let pos = window.length - 1;
  while (pos > 0) {
    if (whitespace.test(window[pos]) && isSafeFenceBreak(spans, pos)) {
      return pos;
    }
    pos -= 1;
  }

  return -1;
}