fix: preserve markdown fences when chunking
This commit is contained in:
@@ -1,6 +1,29 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { chunkText, resolveTextChunkLimit } from "./chunk.js";
|
||||
import {
|
||||
chunkMarkdownText,
|
||||
chunkText,
|
||||
resolveTextChunkLimit,
|
||||
} from "./chunk.js";
|
||||
|
||||
/**
 * Test helper: asserts that no chunk, inspected on its own, ends while a
 * fenced code block is still open.
 *
 * A fence line is up to three leading spaces followed by a run of three or
 * more backticks or tildes. The first fence line seen opens a block; a later
 * fence line closes it only when it uses the same marker character and is at
 * least as long as the opener.
 */
function expectFencesBalanced(chunks: string[]) {
  const fenceLine = /^( {0,3})(`{3,}|~{3,})(.*)$/;

  for (const chunk of chunks) {
    // Describes the currently open fence, or null when outside any fence.
    let pending: { markerChar: string; markerLen: number } | null = null;

    for (const line of chunk.split("\n")) {
      const marker = fenceLine.exec(line)?.[2];
      if (marker === undefined) continue;

      if (pending === null) {
        pending = { markerChar: marker[0], markerLen: marker.length };
      } else if (
        pending.markerChar === marker[0] &&
        marker.length >= pending.markerLen
      ) {
        pending = null;
      }
    }

    expect(pending).toBe(null);
  }
}
|
||||
|
||||
describe("chunkText", () => {
|
||||
it("keeps multi-line text in one chunk when under limit", () => {
|
||||
@@ -72,3 +95,79 @@ describe("resolveTextChunkLimit", () => {
|
||||
expect(resolveTextChunkLimit(cfg, "telegram")).toBe(4000);
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunkMarkdownText", () => {
  // Matches a fence line: up to three spaces of indent, then 3+ backticks or tildes.
  const fenceLinePattern = /^( {0,3})(`{3,}|~{3,})(.*)$/;

  /**
   * Shared assertions for inputs that consist of a single oversized fenced
   * block: the text must be split, every chunk must fit the limit, and every
   * chunk must start with the opening fence line and end with the closing
   * marker.
   */
  const expectForcedFenceSplit = (
    text: string,
    limit: number,
    openingPrefix: string,
    closingSuffix: string,
  ) => {
    const chunks = chunkMarkdownText(text, limit);
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(limit);
      expect(chunk.startsWith(openingPrefix)).toBe(true);
      expect(chunk.trimEnd().endsWith(closingSuffix)).toBe(true);
    }
    expectFencesBalanced(chunks);
  };

  it("keeps fenced blocks intact when a safe break exists", () => {
    const fence = ["```bash", "line1", "line2", "```"].join("\n");
    const text = ["p".repeat(60), fence, "s".repeat(60)].join("\n\n");

    const chunks = chunkMarkdownText(text, 40);
    const hasWholeFence = chunks.some((chunk) => chunk.trimEnd() === fence);

    expect(hasWholeFence).toBe(true);
    expectFencesBalanced(chunks);
  });

  it("reopens fenced blocks when forced to split inside them", () => {
    const text = ["```txt", "a".repeat(500), "```"].join("\n");
    expectForcedFenceSplit(text, 120, "```txt\n", "```");
  });

  it("supports tilde fences", () => {
    const text = ["~~~sh", "x".repeat(600), "~~~"].join("\n");
    expectForcedFenceSplit(text, 140, "~~~sh\n", "~~~");
  });

  it("supports longer fence markers for close", () => {
    const text = ["````md", "y".repeat(600), "````"].join("\n");
    expectForcedFenceSplit(text, 140, "````md\n", "````");
  });

  it("preserves indentation for indented fences", () => {
    const text = [" ```js", " " + "z".repeat(600), " ```"].join("\n");
    expectForcedFenceSplit(text, 160, " ```js\n", " ```");
  });

  it("never produces an empty fenced chunk when splitting", () => {
    const text = ["```txt", "a".repeat(300), "```"].join("\n");

    for (const chunk of chunkMarkdownText(text, 60)) {
      // Drop the fence lines; what is left must contain real content.
      const body = chunk
        .split("\n")
        .filter((line) => !fenceLinePattern.test(line))
        .join("\n");
      expect(body.trim()).not.toBe("");
    }
  });
});
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
// the chunk so messages are only split when they truly exceed the limit.
|
||||
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import {
|
||||
findFenceSpanAt,
|
||||
isSafeFenceBreak,
|
||||
parseFenceSpans,
|
||||
} from "../markdown/fences.js";
|
||||
|
||||
export type TextChunkProvider =
|
||||
| "whatsapp"
|
||||
@@ -91,3 +96,123 @@ export function chunkText(text: string, limit: number): string[] {
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * Splits markdown `text` into chunks of at most `limit` characters, trying to
 * keep fenced code blocks well-formed in every chunk.
 *
 * Strategy per iteration:
 *  1. Prefer the last newline (then the last whitespace) inside the first
 *     `limit` characters that `isSafeFenceBreak` accepts.
 *  2. If the chosen break is not fence-safe, split inside the fence reported
 *     by `findFenceSpanAt`: the chunk gets the fence's closing line appended
 *     and the remainder gets the fence's opening line prepended.
 *  3. If `limit` is too small to hold the closing line (or to make progress
 *     past the opening line), fall back to a hard cut at `limit` without any
 *     fence repair.
 *
 * @param text  Markdown source to split.
 * @param limit Maximum chunk length in characters. A value `<= 0` disables
 *              splitting and returns `[text]`.
 * @returns The chunks in order; `[]` for empty input, `[text]` when the text
 *          already fits.
 */
export function chunkMarkdownText(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];
  if (text.length <= limit) return [text];

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > limit) {
    // Re-parse every round: `remaining` is rewritten below (an opening fence
    // line may be prepended), so spans from the previous round are stale.
    const spans = parseFenceSpans(remaining);
    const window = remaining.slice(0, limit);

    const softBreak = pickSafeBreakIndex(window, spans);
    let breakIdx = softBreak > 0 ? softBreak : limit;

    const initialFence = isSafeFenceBreak(spans, breakIdx)
      ? undefined
      : findFenceSpanAt(spans, breakIdx);

    let fenceToSplit = initialFence;
    if (initialFence) {
      const closeLine = `${initialFence.indent}${initialFence.marker}`;
      // Largest break index that still leaves room for "\n" + closeLine.
      const maxIdxIfNeedNewline = limit - (closeLine.length + 1);
      // Set when the limit cannot accommodate a fence-aware split; in that
      // case we hard-cut at `limit` and must NOT re-enable fence repair below.
      let hardCut = false;

      if (maxIdxIfNeedNewline <= 0) {
        hardCut = true;
        breakIdx = limit;
      } else {
        // Smallest break index that moves past the fence's opening line
        // (presumably open line + newline + at least one content character;
        // confirm against the span layout in ../markdown/fences).
        const minProgressIdx = Math.min(
          remaining.length,
          initialFence.start + initialFence.openLine.length + 2,
        );
        // Largest break index when the chunk already ends in "\n", so only
        // closeLine itself has to fit.
        const maxIdxIfAlreadyNewline = limit - closeLine.length;

        // Walk newlines right-to-left looking for a line boundary inside the
        // same fence that still makes progress.
        let pickedNewline = false;
        let lastNewline = remaining.lastIndexOf(
          "\n",
          Math.max(0, maxIdxIfAlreadyNewline - 1),
        );
        while (lastNewline !== -1) {
          const candidateBreak = lastNewline + 1;
          if (candidateBreak < minProgressIdx) break;
          const candidateFence = findFenceSpanAt(spans, candidateBreak);
          if (candidateFence && candidateFence.start === initialFence.start) {
            breakIdx = Math.max(1, candidateBreak);
            pickedNewline = true;
            break;
          }
          lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
        }

        if (!pickedNewline) {
          if (minProgressIdx > maxIdxIfAlreadyNewline) {
            hardCut = true;
            breakIdx = limit;
          } else {
            breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
          }
        }
      }

      // Only repair the fence when the final break still lies in the fence we
      // started with. After a hard cut the lookup is skipped entirely;
      // otherwise the close/open lines would be re-added, pushing the chunk
      // past `limit` and, for very small limits, preventing `remaining` from
      // ever shrinking.
      const fenceAtBreak = hardCut
        ? undefined
        : findFenceSpanAt(spans, breakIdx);
      fenceToSplit =
        fenceAtBreak && fenceAtBreak.start === initialFence.start
          ? fenceAtBreak
          : undefined;
    }

    let rawChunk = remaining.slice(0, breakIdx);
    // Defensive: an empty chunk would mean no progress; stop and let the
    // trailing push below emit whatever is left.
    if (!rawChunk) break;

    // When the break lands on a whitespace character, consume it so the next
    // chunk does not start with the separator.
    const brokeOnSeparator =
      breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
    const nextStart = Math.min(
      remaining.length,
      breakIdx + (brokeOnSeparator ? 1 : 0),
    );
    let next = remaining.slice(nextStart);

    if (fenceToSplit) {
      // Close the fence at the end of this chunk and reopen it at the start
      // of the remainder.
      const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
      rawChunk = rawChunk.endsWith("\n")
        ? `${rawChunk}${closeLine}`
        : `${rawChunk}\n${closeLine}`;
      next = `${fenceToSplit.openLine}\n${next}`;
    } else {
      next = stripLeadingNewlines(next);
    }

    chunks.push(rawChunk);
    remaining = next;
  }

  if (remaining.length) chunks.push(remaining);
  return chunks;
}
|
||||
|
||||
/**
 * Removes every "\n" at the very start of `value`.
 *
 * Only line-feed characters are stripped; other leading whitespace (spaces,
 * tabs, "\r") is left untouched, and the input is returned unchanged when it
 * does not start with a newline.
 */
function stripLeadingNewlines(value: string): string {
  return value.replace(/^\n+/, "");
}
|
||||
|
||||
/**
 * Finds the right-most index inside `window` where the text can be cut
 * without violating `isSafeFenceBreak`.
 *
 * Newlines are preferred: they are scanned right-to-left first. If none
 * qualifies, any whitespace character is considered, again right-to-left.
 * Index 0 is never returned, since a break there would produce an empty chunk.
 *
 * @param window Leading slice of the text that must contain the break.
 * @param spans  Fence spans for the text, as produced by `parseFenceSpans`.
 * @returns The chosen break index, or -1 when no safe break exists.
 */
function pickSafeBreakIndex(
  window: string,
  spans: ReturnType<typeof parseFenceSpans>,
): number {
  // Pass 1: line boundaries only.
  for (
    let at = window.lastIndexOf("\n");
    at > 0;
    at = window.lastIndexOf("\n", at - 1)
  ) {
    if (isSafeFenceBreak(spans, at)) return at;
  }

  // Pass 2: fall back to any whitespace character.
  let cursor = window.length;
  while (--cursor > 0) {
    if (!/\s/.test(window[cursor])) continue;
    if (isSafeFenceBreak(spans, cursor)) return cursor;
  }

  return -1;
}
|
||||
|
||||
Reference in New Issue
Block a user