fix: newline chunking across channels
This commit is contained in:
@@ -3,6 +3,7 @@ import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
chunkByNewline,
|
||||
chunkMarkdownText,
|
||||
chunkMarkdownTextWithMode,
|
||||
chunkText,
|
||||
chunkTextWithMode,
|
||||
resolveChunkMode,
|
||||
@@ -246,10 +247,10 @@ describe("chunkByNewline", () => {
|
||||
expect(chunks).toEqual(["Line one", "Line two", "Line three"]);
|
||||
});
|
||||
|
||||
it("filters empty lines", () => {
|
||||
it("preserves blank lines by folding into the next chunk", () => {
|
||||
const text = "Line one\n\n\nLine two\n\nLine three";
|
||||
const chunks = chunkByNewline(text, 1000);
|
||||
expect(chunks).toEqual(["Line one", "Line two", "Line three"]);
|
||||
expect(chunks).toEqual(["Line one", "\n\nLine two", "\nLine three"]);
|
||||
});
|
||||
|
||||
it("trims whitespace from lines", () => {
|
||||
@@ -258,6 +259,12 @@ describe("chunkByNewline", () => {
|
||||
expect(chunks).toEqual(["Line one", "Line two"]);
|
||||
});
|
||||
|
||||
it("preserves leading blank lines on the first chunk", () => {
|
||||
const text = "\n\nLine one\nLine two";
|
||||
const chunks = chunkByNewline(text, 1000);
|
||||
expect(chunks).toEqual(["\n\nLine one", "Line two"]);
|
||||
});
|
||||
|
||||
it("falls back to length-based for long lines", () => {
|
||||
const text = "Short line\n" + "a".repeat(50) + "\nAnother short";
|
||||
const chunks = chunkByNewline(text, 20);
|
||||
@@ -269,6 +276,12 @@ describe("chunkByNewline", () => {
|
||||
expect(chunks[4]).toBe("Another short");
|
||||
});
|
||||
|
||||
it("does not split long lines when splitLongLines is false", () => {
|
||||
const text = "a".repeat(50);
|
||||
const chunks = chunkByNewline(text, 20, { splitLongLines: false });
|
||||
expect(chunks).toEqual([text]);
|
||||
});
|
||||
|
||||
it("returns empty array for empty input", () => {
|
||||
expect(chunkByNewline("", 100)).toEqual([]);
|
||||
});
|
||||
@@ -276,6 +289,18 @@ describe("chunkByNewline", () => {
|
||||
it("returns empty array for whitespace-only input", () => {
|
||||
expect(chunkByNewline(" \n\n ", 100)).toEqual([]);
|
||||
});
|
||||
|
||||
it("preserves trailing blank lines on the last chunk", () => {
|
||||
const text = "Line one\n\n";
|
||||
const chunks = chunkByNewline(text, 1000);
|
||||
expect(chunks).toEqual(["Line one\n\n"]);
|
||||
});
|
||||
|
||||
it("keeps whitespace when trimLines is false", () => {
|
||||
const text = " indented line \nNext";
|
||||
const chunks = chunkByNewline(text, 1000, { trimLines: false });
|
||||
expect(chunks).toEqual([" indented line ", "Next"]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunkTextWithMode", () => {
|
||||
@@ -292,6 +317,26 @@ describe("chunkTextWithMode", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunkMarkdownTextWithMode", () => {
|
||||
it("uses markdown-aware chunking for length mode", () => {
|
||||
const text = "Line one\nLine two";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "length")).toEqual(chunkMarkdownText(text, 1000));
|
||||
});
|
||||
|
||||
it("uses newline-based chunking for newline mode", () => {
|
||||
const text = "Line one\nLine two";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual(["Line one", "Line two"]);
|
||||
});
|
||||
|
||||
it("does not split inside code fences for newline mode", () => {
|
||||
const text = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";
|
||||
expect(chunkMarkdownTextWithMode(text, 1000, "newline")).toEqual([
|
||||
"```js\nconst a = 1;\nconst b = 2;\n```",
|
||||
"After",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveChunkMode", () => {
|
||||
it("returns length as default", () => {
|
||||
expect(resolveChunkMode(undefined, "telegram")).toBe("length");
|
||||
@@ -304,16 +349,16 @@ describe("resolveChunkMode", () => {
|
||||
expect(resolveChunkMode(cfg, "__internal__")).toBe("length");
|
||||
});
|
||||
|
||||
it("supports provider-level overrides for bluebubbles", () => {
|
||||
const cfg = { channels: { bluebubbles: { chunkMode: "newline" as const } } };
|
||||
expect(resolveChunkMode(cfg, "bluebubbles")).toBe("newline");
|
||||
it("supports provider-level overrides for slack", () => {
|
||||
const cfg = { channels: { slack: { chunkMode: "newline" as const } } };
|
||||
expect(resolveChunkMode(cfg, "slack")).toBe("newline");
|
||||
expect(resolveChunkMode(cfg, "discord")).toBe("length");
|
||||
});
|
||||
|
||||
it("supports account-level overrides for bluebubbles", () => {
|
||||
it("supports account-level overrides for slack", () => {
|
||||
const cfg = {
|
||||
channels: {
|
||||
bluebubbles: {
|
||||
slack: {
|
||||
chunkMode: "length" as const,
|
||||
accounts: {
|
||||
primary: { chunkMode: "newline" as const },
|
||||
@@ -321,12 +366,7 @@ describe("resolveChunkMode", () => {
|
||||
},
|
||||
},
|
||||
};
|
||||
expect(resolveChunkMode(cfg, "bluebubbles", "primary")).toBe("newline");
|
||||
expect(resolveChunkMode(cfg, "bluebubbles", "other")).toBe("length");
|
||||
});
|
||||
|
||||
it("ignores chunkMode for non-bluebubbles providers", () => {
|
||||
const cfg = { channels: { ["telegram" as string]: { chunkMode: "newline" as const } } };
|
||||
expect(resolveChunkMode(cfg, "telegram")).toBe("length");
|
||||
expect(resolveChunkMode(cfg, "slack", "primary")).toBe("newline");
|
||||
expect(resolveChunkMode(cfg, "slack", "other")).toBe("length");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -101,8 +101,6 @@ export function resolveChunkMode(
|
||||
accountId?: string | null,
|
||||
): ChunkMode {
|
||||
if (!provider || provider === INTERNAL_MESSAGE_CHANNEL) return DEFAULT_CHUNK_MODE;
|
||||
// Chunk mode is only supported for BlueBubbles.
|
||||
if (provider !== "bluebubbles") return DEFAULT_CHUNK_MODE;
|
||||
const channelsConfig = cfg?.channels as Record<string, unknown> | undefined;
|
||||
const providerConfig = (channelsConfig?.[provider] ??
|
||||
(cfg as Record<string, unknown> | undefined)?.[provider]) as ProviderChunkConfig | undefined;
|
||||
@@ -111,25 +109,56 @@ export function resolveChunkMode(
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text on newlines, filtering empty lines.
|
||||
* Lines exceeding maxLineLength are further split using length-based chunking.
|
||||
* Split text on newlines, trimming line whitespace.
|
||||
* Blank lines are folded into the next non-empty line as leading "\n" prefixes.
|
||||
* Long lines can be split by length (default) or kept intact via splitLongLines:false.
|
||||
*/
|
||||
export function chunkByNewline(text: string, maxLineLength: number): string[] {
|
||||
export function chunkByNewline(
|
||||
text: string,
|
||||
maxLineLength: number,
|
||||
opts?: {
|
||||
splitLongLines?: boolean;
|
||||
trimLines?: boolean;
|
||||
isSafeBreak?: (index: number) => boolean;
|
||||
},
|
||||
): string[] {
|
||||
if (!text) return [];
|
||||
const lines = text.split("\n");
|
||||
if (maxLineLength <= 0) return text.trim() ? [text] : [];
|
||||
const splitLongLines = opts?.splitLongLines !== false;
|
||||
const trimLines = opts?.trimLines !== false;
|
||||
const lines = splitByNewline(text, opts?.isSafeBreak);
|
||||
const chunks: string[] = [];
|
||||
let pendingBlankLines = 0;
|
||||
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) continue; // skip empty lines
|
||||
|
||||
if (trimmed.length <= maxLineLength) {
|
||||
chunks.push(trimmed);
|
||||
} else {
|
||||
// Long line: fall back to length-based chunking
|
||||
const subChunks = chunkText(trimmed, maxLineLength);
|
||||
chunks.push(...subChunks);
|
||||
if (!trimmed) {
|
||||
pendingBlankLines += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
const maxPrefix = Math.max(0, maxLineLength - 1);
|
||||
const cappedBlankLines = pendingBlankLines > 0 ? Math.min(pendingBlankLines, maxPrefix) : 0;
|
||||
const prefix = cappedBlankLines > 0 ? "\n".repeat(cappedBlankLines) : "";
|
||||
pendingBlankLines = 0;
|
||||
|
||||
const lineValue = trimLines ? trimmed : line;
|
||||
if (!splitLongLines || lineValue.length + prefix.length <= maxLineLength) {
|
||||
chunks.push(prefix + lineValue);
|
||||
continue;
|
||||
}
|
||||
|
||||
const firstLimit = Math.max(1, maxLineLength - prefix.length);
|
||||
const first = lineValue.slice(0, firstLimit);
|
||||
chunks.push(prefix + first);
|
||||
const remaining = lineValue.slice(firstLimit);
|
||||
if (remaining) {
|
||||
chunks.push(...chunkText(remaining, maxLineLength));
|
||||
}
|
||||
}
|
||||
|
||||
if (pendingBlankLines > 0 && chunks.length > 0) {
|
||||
chunks[chunks.length - 1] += "\n".repeat(pendingBlankLines);
|
||||
}
|
||||
|
||||
return chunks;
|
||||
@@ -140,11 +169,59 @@ export function chunkByNewline(text: string, maxLineLength: number): string[] {
|
||||
*/
|
||||
export function chunkTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
    // Split on newlines first, but keep over-long lines intact
    // (splitLongLines: false) so the length-based chunker below decides
    // where each long line is broken.
    const lineChunks = chunkByNewline(text, limit, { splitLongLines: false });
    const chunks: string[] = [];
    for (const line of lineChunks) {
      const nested = chunkText(line, limit);
      if (!nested.length && line) {
        // Defensive: if the length-based chunker yields nothing for a
        // non-empty line, keep the line as-is instead of dropping it.
        chunks.push(line);
        continue;
      }
      chunks.push(...nested);
    }
    return chunks;
  }
  // Any other mode: plain length-based chunking of the whole text.
  return chunkText(text, limit);
}
|
||||
|
||||
/**
 * Chunk markdown text according to the given chunk mode.
 *
 * In "newline" mode the text is first split on newlines via `chunkByNewline`
 * (without trimming lines and without splitting long lines), where a newline
 * only counts as a break when `isSafeFenceBreak` accepts its index for the
 * fence spans parsed from `text` — presumably this keeps fenced code blocks
 * in one piece; confirm against `isSafeFenceBreak`. Each resulting piece is
 * then passed through `chunkMarkdownText` with the same limit.
 *
 * In any other mode the whole text goes straight to `chunkMarkdownText`.
 *
 * @param text - Markdown text to chunk.
 * @param limit - Length limit forwarded to the underlying chunkers.
 * @param mode - Chunk mode; only "newline" selects newline-first splitting.
 * @returns The chunks in their original order.
 */
export function chunkMarkdownTextWithMode(text: string, limit: number, mode: ChunkMode): string[] {
  if (mode === "newline") {
    const spans = parseFenceSpans(text);
    const lineChunks = chunkByNewline(text, limit, {
      splitLongLines: false,
      trimLines: false,
      isSafeBreak: (index) => isSafeFenceBreak(spans, index),
    });
    const chunks: string[] = [];
    for (const line of lineChunks) {
      const nested = chunkMarkdownText(line, limit);
      if (!nested.length && line) {
        // Defensive: keep a non-empty piece even if the markdown chunker
        // returns nothing for it.
        chunks.push(line);
        continue;
      }
      chunks.push(...nested);
    }
    return chunks;
  }
  return chunkMarkdownText(text, limit);
}
|
||||
|
||||
/**
 * Split `text` at "\n" characters, consulting `isSafeBreak` for each one.
 *
 * A newline whose index is rejected by `isSafeBreak` is not treated as a
 * separator and stays inside the current segment. The separators themselves
 * are not included in the output. The result always has at least one element
 * (the empty string for empty input), and a trailing newline produces a
 * trailing empty segment.
 *
 * @param text - Text to split.
 * @param isSafeBreak - Called with the index of each "\n" in `text`, in
 *   ascending order; return false to keep that newline inside the segment.
 *   Defaults to accepting every newline.
 * @returns The segments between accepted newlines, in order.
 */
function splitByNewline(
  text: string,
  isSafeBreak: (index: number) => boolean = () => true,
): string[] {
  const lines: string[] = [];
  let start = 0;
  let newlineAt = text.indexOf("\n");
  while (newlineAt !== -1) {
    if (isSafeBreak(newlineAt)) {
      lines.push(text.slice(start, newlineAt));
      start = newlineAt + 1;
    }
    newlineAt = text.indexOf("\n", newlineAt + 1);
  }
  // Whatever follows the last accepted break (possibly "") is the final segment.
  lines.push(text.slice(start));
  return lines;
}
|
||||
|
||||
export function chunkText(text: string, limit: number): string[] {
|
||||
if (!text) return [];
|
||||
if (limit <= 0) return [text];
|
||||
|
||||
@@ -69,7 +69,7 @@ export function resolveBlockStreamingChunking(
|
||||
});
|
||||
const chunkCfg = cfg?.agents?.defaults?.blockStreamingChunk;
|
||||
|
||||
// BlueBubbles-only: if chunkMode is "newline", use newline-based streaming
|
||||
// When chunkMode is "newline", use newline-based streaming
|
||||
const channelChunkMode = resolveChunkMode(cfg, providerKey, accountId);
|
||||
if (channelChunkMode === "newline") {
|
||||
// For newline mode: use very low minChars to flush quickly on newlines
|
||||
@@ -103,7 +103,7 @@ export function resolveBlockStreamingCoalescing(
|
||||
): BlockStreamingCoalescing | undefined {
|
||||
const providerKey = normalizeChunkProvider(provider);
|
||||
|
||||
// BlueBubbles-only: when chunkMode is "newline", disable coalescing to send each line immediately
|
||||
// When chunkMode is "newline", disable coalescing to send each line immediately
|
||||
const channelChunkMode = resolveChunkMode(cfg, providerKey, accountId);
|
||||
if (channelChunkMode === "newline") {
|
||||
return undefined;
|
||||
|
||||
Reference in New Issue
Block a user