fix: preserve markdown fences when chunking

This commit is contained in:
Peter Steinberger
2026-01-06 20:19:56 +01:00
parent afc42c7547
commit 67bda21811
13 changed files with 378 additions and 108 deletions

View File

@@ -1,6 +1,29 @@
import { describe, expect, it } from "vitest";
import { chunkText, resolveTextChunkLimit } from "./chunk.js";
import {
chunkMarkdownText,
chunkText,
resolveTextChunkLimit,
} from "./chunk.js";
/**
 * Asserts that every chunk is self-contained with respect to fenced code
 * blocks: each fence opened inside a chunk must also be closed inside it.
 *
 * Fence detection follows the CommonMark rules that matter here:
 * - a fence line is indented by at most three spaces and starts with a run of
 *   three or more backticks or tildes;
 * - an opening backtick fence may not have backticks in its info string
 *   (such a line is inline code, not a fence);
 * - a closing fence uses the same character as the opener, is at least as
 *   long, and is followed by nothing but whitespace. A line such as "```js"
 *   inside an open block is therefore content, not a close.
 *
 * @param chunks - Chunks produced by the chunker under test.
 */
function expectFencesBalanced(chunks: string[]) {
  const fenceLine = /^( {0,3})(`{3,}|~{3,})(.*)$/;
  for (const chunk of chunks) {
    let open: { markerChar: string; markerLen: number } | null = null;
    for (const line of chunk.split("\n")) {
      const match = fenceLine.exec(line);
      if (!match) continue;
      const marker = match[2] ?? "";
      const rest = match[3] ?? "";
      const markerChar = marker.charAt(0);
      if (!open) {
        // A backtick "opener" whose info string contains a backtick is an
        // inline code span, so it must not start a block.
        if (markerChar === "`" && rest.includes("`")) continue;
        open = { markerChar, markerLen: marker.length };
        continue;
      }
      // Only a bare fence (no info string) of the same kind and sufficient
      // length closes the block; anything else is block content.
      const closesBlock =
        markerChar === open.markerChar &&
        marker.length >= open.markerLen &&
        rest.trim() === "";
      if (closesBlock) {
        open = null;
      }
    }
    expect(open).toBe(null);
  }
}
describe("chunkText", () => {
it("keeps multi-line text in one chunk when under limit", () => {
@@ -72,3 +95,79 @@ describe("resolveTextChunkLimit", () => {
expect(resolveTextChunkLimit(cfg, "telegram")).toBe(4000);
});
});
// Suite for the markdown-aware chunker: fenced code blocks must stay intact
// when possible and be closed/reopened when a split inside them is forced.
describe("chunkMarkdownText", () => {
  // Any line that looks like a code-fence marker (up to three leading spaces,
  // then three or more backticks or tildes).
  const fenceMarkerLine = /^( {0,3})(`{3,}|~{3,})(.*)$/;

  /**
   * Chunks `text` at `limit` and checks that the split was forced (more than
   * one chunk), that every chunk respects the limit, and that every chunk is
   * wrapped in its own fence: it begins with `opener` and, ignoring trailing
   * whitespace, ends with `closer`.
   */
  const expectEveryChunkRefenced = (
    text: string,
    limit: number,
    opener: string,
    closer: string,
  ): void => {
    const pieces = chunkMarkdownText(text, limit);
    expect(pieces.length).toBeGreaterThan(1);
    for (const piece of pieces) {
      expect(piece.length).toBeLessThanOrEqual(limit);
      expect(piece.startsWith(opener)).toBe(true);
      expect(piece.trimEnd().endsWith(closer)).toBe(true);
    }
    expectFencesBalanced(pieces);
  };

  it("keeps fenced blocks intact when a safe break exists", () => {
    const fence = "```bash\nline1\nline2\n```";
    const before = "p".repeat(60);
    const after = "s".repeat(60);
    const pieces = chunkMarkdownText(`${before}\n\n${fence}\n\n${after}`, 40);
    // The whole fenced block must show up as one chunk of its own.
    const trimmed = pieces.map((piece) => piece.trimEnd());
    expect(trimmed.includes(fence)).toBe(true);
    expectFencesBalanced(pieces);
  });

  it("reopens fenced blocks when forced to split inside them", () => {
    expectEveryChunkRefenced(
      `\`\`\`txt\n${"a".repeat(500)}\n\`\`\``,
      120,
      "```txt\n",
      "```",
    );
  });

  it("supports tilde fences", () => {
    expectEveryChunkRefenced(
      `~~~sh\n${"x".repeat(600)}\n~~~`,
      140,
      "~~~sh\n",
      "~~~",
    );
  });

  it("supports longer fence markers for close", () => {
    expectEveryChunkRefenced(
      `\`\`\`\`md\n${"y".repeat(600)}\n\`\`\`\``,
      140,
      "````md\n",
      "````",
    );
  });

  it("preserves indentation for indented fences", () => {
    expectEveryChunkRefenced(
      ` \`\`\`js\n ${"z".repeat(600)}\n \`\`\``,
      160,
      " ```js\n",
      " ```",
    );
  });

  it("never produces an empty fenced chunk when splitting", () => {
    const pieces = chunkMarkdownText(`\`\`\`txt\n${"a".repeat(300)}\n\`\`\``, 60);
    for (const piece of pieces) {
      // Drop the fence marker lines; what is left must contain real content.
      const body = piece
        .split("\n")
        .filter((line) => !fenceMarkerLine.test(line))
        .join("\n");
      expect(body.trim()).not.toBe("");
    }
  });
});