fix: paragraph-aware newline chunking (#1726)
Thanks @tyler6204

Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
This commit is contained in:
@@ -344,6 +344,11 @@ describe("chunkMarkdownTextWithMode", () => {
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
|
||||
});
|
||||
|
||||
it("defers long markdown paragraphs to markdown chunking in newline mode", () => {
|
||||
const text = `\`\`\`js\n${"const a = 1;\n".repeat(20)}\`\`\``;
|
||||
expect(chunkMarkdownTextWithMode(text, 40, "newline")).toEqual(chunkMarkdownText(text, 40));
|
||||
});
|
||||
|
||||
it("does not split on blank lines inside a fenced code block", () => {
|
||||
const text = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([text]);
|
||||
|
||||
@@ -173,10 +173,16 @@ export function chunkByNewline(
|
||||
* - Only breaks at paragraph separators ("\n\n" or more, allowing whitespace on blank lines)
|
||||
* - Packs multiple paragraphs into a single chunk up to `limit`
|
||||
* - Falls back to length-based splitting when a single paragraph exceeds `limit`
|
||||
* (unless `splitLongParagraphs` is `false`, in which case the oversized paragraph is returned as a single unsplit chunk)
|
||||
*/
|
||||
export function chunkByParagraph(text: string, limit: number): string[] {
|
||||
export function chunkByParagraph(
|
||||
text: string,
|
||||
limit: number,
|
||||
opts?: { splitLongParagraphs?: boolean },
|
||||
): string[] {
|
||||
if (!text) return [];
|
||||
if (limit <= 0) return [text];
|
||||
const splitLongParagraphs = opts?.splitLongParagraphs !== false;
|
||||
|
||||
// Normalize to \n so blank line detection is consistent.
|
||||
const normalized = text.replace(/\r\n?/g, "\n");
|
||||
@@ -186,7 +192,9 @@ export function chunkByParagraph(text: string, limit: number): string[] {
|
||||
// boundaries, not only exceeding a length limit.)
|
||||
const paragraphRe = /\n[\t ]*\n+/;
|
||||
if (!paragraphRe.test(normalized)) {
|
||||
return normalized.length <= limit ? [normalized] : chunkText(normalized, limit);
|
||||
if (normalized.length <= limit) return [normalized];
|
||||
if (!splitLongParagraphs) return [normalized];
|
||||
return chunkText(normalized, limit);
|
||||
}
|
||||
|
||||
const spans = parseFenceSpans(normalized);
|
||||
@@ -213,6 +221,8 @@ export function chunkByParagraph(text: string, limit: number): string[] {
|
||||
if (!paragraph.trim()) continue;
|
||||
if (paragraph.length <= limit) {
|
||||
chunks.push(paragraph);
|
||||
} else if (!splitLongParagraphs) {
|
||||
chunks.push(paragraph);
|
||||
} else {
|
||||
chunks.push(...chunkText(paragraph, limit));
|
||||
}
|
||||
@@ -235,7 +245,7 @@ export function chunkMarkdownTextWithMode(text: string, limit: number, mode: Chu
|
||||
if (mode === "newline") {
|
||||
// Paragraph chunking is fence-safe because we never split at arbitrary indices.
|
||||
// If a paragraph must be split by length, defer to the markdown-aware chunker.
|
||||
const paragraphChunks = chunkByParagraph(text, limit);
|
||||
const paragraphChunks = chunkByParagraph(text, limit, { splitLongParagraphs: false });
|
||||
const out: string[] = [];
|
||||
for (const chunk of paragraphChunks) {
|
||||
const nested = chunkMarkdownText(chunk, limit);
|
||||
|
||||
@@ -58,7 +58,7 @@ describe("chunkDiscordText", () => {
|
||||
maxLines: 50,
|
||||
chunkMode: "newline",
|
||||
});
|
||||
expect(chunks).toEqual(["```js\nconst a = 1;\nconst b = 2;\n```", "After"]);
|
||||
expect(chunks).toEqual([text]);
|
||||
});
|
||||
|
||||
it("reserves space for closing fences when chunking", () => {
|
||||
|
||||
@@ -192,7 +192,7 @@ describe("deliverOutboundPayloads", () => {
|
||||
expect(sendWhatsApp).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
"+1555",
|
||||
"\nLine two",
|
||||
"Line two",
|
||||
expect.objectContaining({ verbose: false }),
|
||||
);
|
||||
});
|
||||
@@ -241,9 +241,8 @@ describe("deliverOutboundPayloads", () => {
|
||||
payloads: [{ text }],
|
||||
});
|
||||
|
||||
expect(chunker).toHaveBeenCalledTimes(2);
|
||||
expect(chunker).toHaveBeenNthCalledWith(1, "```js\nconst a = 1;\nconst b = 2;\n```", 4000);
|
||||
expect(chunker).toHaveBeenNthCalledWith(2, "After", 4000);
|
||||
expect(chunker).toHaveBeenCalledTimes(1);
|
||||
expect(chunker).toHaveBeenNthCalledWith(1, text, 4000);
|
||||
});
|
||||
|
||||
it("uses iMessage media maxBytes from agent fallback", async () => {
|
||||
|
||||
Reference in New Issue
Block a user