249 lines
7.0 KiB
TypeScript
249 lines
7.0 KiB
TypeScript
// Utilities for splitting outbound text into platform-sized chunks without
// unintentionally breaking on newlines. Using [\s\S] keeps newlines inside
// the chunk so messages are only split when they truly exceed the limit.
|
|
|
|
import type { ClawdbotConfig } from "../config/config.js";
|
|
import {
|
|
findFenceSpanAt,
|
|
isSafeFenceBreak,
|
|
parseFenceSpans,
|
|
} from "../markdown/fences.js";
|
|
import { normalizeAccountId } from "../routing/session-key.js";
|
|
|
|
/**
 * Providers for which a text chunk limit can be resolved.
 *
 * Every member must have an entry in `DEFAULT_CHUNK_LIMIT_BY_PROVIDER`
 * (enforced by its `Record<TextChunkProvider, number>` type).
 */
export type TextChunkProvider =
  | "whatsapp"
  | "telegram"
  | "discord"
  | "slack"
  | "signal"
  | "imessage"
  | "webchat";

/**
 * Default maximum chunk length per provider, used when no positive override
 * is configured for that provider.
 *
 * NOTE(review): the unit is presumably `String.length` (UTF-16 code units),
 * since that is what the chunkers in this file compare against — confirm
 * with callers.
 */
const DEFAULT_CHUNK_LIMIT_BY_PROVIDER: Record<TextChunkProvider, number> = {
  whatsapp: 4000,
  telegram: 4000,
  discord: 2000,
  slack: 4000,
  signal: 4000,
  imessage: 4000,
  webchat: 4000,
};
|
|
|
|
/**
 * Resolves the maximum chunk length to use for outbound text.
 *
 * Precedence, first match wins:
 *   1. A per-account `textChunkLimit` (telegram, discord, slack, signal and
 *      imessage only), looked up under the normalized account id.
 *   2. The provider-level `textChunkLimit` (all of the above plus whatsapp).
 *   3. The provider's entry in `DEFAULT_CHUNK_LIMIT_BY_PROVIDER`.
 *   4. 4000 when no provider is given.
 *
 * A configured value is honored only when it is a number greater than zero;
 * anything else falls through to the defaults.
 *
 * @param cfg - Loaded configuration, or `undefined` when none is available.
 * @param provider - Provider the text is destined for.
 * @param accountId - Account whose per-account override should be consulted.
 * @returns The chunk limit to apply.
 */
export function resolveTextChunkLimit(
  cfg: ClawdbotConfig | undefined,
  provider?: TextChunkProvider,
  accountId?: string | null,
): number {
  // Typed as `unknown` so the runtime `typeof` guard below stays meaningful
  // regardless of how the config type declares `textChunkLimit`.
  let configured: unknown;

  if (provider) {
    const accountKey = normalizeAccountId(accountId);
    switch (provider) {
      case "whatsapp":
        // Only a provider-level setting is read for WhatsApp.
        configured = cfg?.whatsapp?.textChunkLimit;
        break;
      case "telegram":
        configured =
          cfg?.telegram?.accounts?.[accountKey]?.textChunkLimit ??
          cfg?.telegram?.textChunkLimit;
        break;
      case "discord":
        configured =
          cfg?.discord?.accounts?.[accountKey]?.textChunkLimit ??
          cfg?.discord?.textChunkLimit;
        break;
      case "slack":
        configured =
          cfg?.slack?.accounts?.[accountKey]?.textChunkLimit ??
          cfg?.slack?.textChunkLimit;
        break;
      case "signal":
        configured =
          cfg?.signal?.accounts?.[accountKey]?.textChunkLimit ??
          cfg?.signal?.textChunkLimit;
        break;
      case "imessage":
        configured =
          cfg?.imessage?.accounts?.[accountKey]?.textChunkLimit ??
          cfg?.imessage?.textChunkLimit;
        break;
      default:
        // "webchat" has no configurable override; use the default table.
        break;
    }
  }

  if (typeof configured === "number" && configured > 0) {
    return configured;
  }
  return provider ? DEFAULT_CHUNK_LIMIT_BY_PROVIDER[provider] : 4000;
}
|
|
|
|
/**
 * Splits plain text into chunks of at most `limit` UTF-16 code units.
 *
 * Break positions are chosen in this order of preference within each
 * `limit`-sized window:
 *   1. the last newline,
 *   2. the last whitespace character,
 *   3. a hard break at the limit.
 *
 * Trailing whitespace is trimmed from each emitted chunk and leading
 * whitespace from the text that follows it; chunks that end up empty are
 * dropped.
 *
 * @param text - Text to split. An empty string yields `[]`.
 * @param limit - Maximum chunk length. Fractional values are floored. A limit
 *   below 1 (or NaN) disables splitting and yields `[text]`.
 * @returns The chunks, in order.
 */
export function chunkText(text: string, limit: number): string[] {
  if (!text) return [];

  // Floor the limit: string slicing truncates fractional indices anyway, and
  // a limit in (0, 1) would otherwise never advance and loop forever.
  const maxLen = Math.floor(limit);
  if (!(maxLen >= 1)) return [text];
  if (text.length <= maxLen) return [text];

  const whitespace = /\s/;
  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > maxLen) {
    const window = remaining.slice(0, maxLen);

    // 1) Prefer a newline break inside the window.
    let breakIdx = window.lastIndexOf("\n");

    // 2) Otherwise prefer the last whitespace (word boundary) inside the window.
    if (breakIdx <= 0) {
      for (let i = window.length - 1; i >= 0; i--) {
        if (whitespace.test(window.charAt(i))) {
          breakIdx = i;
          break;
        }
      }
    }

    // 3) Fallback: hard break at the limit. Index 0 is never accepted above
    //    because breaking there would make no progress.
    if (breakIdx <= 0) {
      breakIdx = maxLen;
      // Do not cut between the halves of a surrogate pair (e.g. an emoji);
      // back up one unit so the pair moves intact into the next chunk. With
      // a limit of 1 there is no room to back up, so the split is unavoidable.
      if (breakIdx > 1) {
        const before = remaining.charCodeAt(breakIdx - 1);
        const after = remaining.charCodeAt(breakIdx);
        const splitsPair =
          before >= 0xd800 &&
          before <= 0xdbff &&
          after >= 0xdc00 &&
          after <= 0xdfff;
        if (splitsPair) breakIdx -= 1;
      }
    }

    const chunk = remaining.slice(0, breakIdx).trimEnd();
    if (chunk.length > 0) {
      chunks.push(chunk);
    }

    // If we broke on whitespace/newline, skip that separator; for hard breaks keep it.
    const brokeOnSeparator =
      breakIdx < remaining.length && whitespace.test(remaining.charAt(breakIdx));
    const nextStart = Math.min(
      remaining.length,
      breakIdx + (brokeOnSeparator ? 1 : 0),
    );
    remaining = remaining.slice(nextStart).trimStart();
  }

  if (remaining.length) chunks.push(remaining);

  return chunks;
}
|
|
|
|
/**
 * Splits markdown text into chunks of roughly `limit` characters while
 * keeping fenced code blocks well-formed.
 *
 * For each pass a break index is chosen via `pickSafeBreakIndex` (falling
 * back to a hard break at `limit`). If that index is not a safe fence break,
 * the fence containing it is split: the current chunk gets a closing fence
 * line appended and the remaining text is prefixed with the fence's opening
 * line, so both halves stay fenced.
 *
 * @param text - Markdown to split. An empty string yields `[]`.
 * @param limit - Target maximum chunk length; `limit <= 0` yields `[text]`.
 * @returns The chunks, in order.
 */
export function chunkMarkdownText(text: string, limit: number): string[] {
  if (!text) return [];
  if (limit <= 0) return [text];
  if (text.length <= limit) return [text];

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > limit) {
    // Re-parse every pass: `remaining` is rewritten below (a fence opening
    // line may be prepended), so earlier span offsets no longer apply.
    const spans = parseFenceSpans(remaining);
    const window = remaining.slice(0, limit);

    const softBreak = pickSafeBreakIndex(window, spans);
    let breakIdx = softBreak > 0 ? softBreak : limit;

    // Only set when the chosen index is NOT a safe fence break.
    const initialFence = isSafeFenceBreak(spans, breakIdx)
      ? undefined
      : findFenceSpanAt(spans, breakIdx);

    let fenceToSplit = initialFence;
    if (initialFence) {
      const closeLine = `${initialFence.indent}${initialFence.marker}`;
      // Largest break index that still leaves room for "\n" + closeLine.
      const maxIdxIfNeedNewline = limit - (closeLine.length + 1);

      if (maxIdxIfNeedNewline <= 0) {
        // The closing fence line alone does not fit within the limit.
        fenceToSplit = undefined;
        breakIdx = limit;
      } else {
        // Smallest acceptable break; presumably just past the fence's opening
        // line so that every split makes progress — TODO confirm the "+ 2".
        const minProgressIdx = Math.min(
          remaining.length,
          initialFence.start + initialFence.openLine.length + 2,
        );
        // Largest break index when the chunk already ends in "\n" (only
        // closeLine needs to be appended).
        const maxIdxIfAlreadyNewline = limit - closeLine.length;

        // Walk newlines backwards looking for a line boundary that is still
        // inside the same fence.
        let pickedNewline = false;
        let lastNewline = remaining.lastIndexOf(
          "\n",
          Math.max(0, maxIdxIfAlreadyNewline - 1),
        );
        while (lastNewline !== -1) {
          const candidateBreak = lastNewline + 1;
          if (candidateBreak < minProgressIdx) break;
          const candidateFence = findFenceSpanAt(spans, candidateBreak);
          if (candidateFence && candidateFence.start === initialFence.start) {
            breakIdx = Math.max(1, candidateBreak);
            pickedNewline = true;
            break;
          }
          lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
        }

        if (!pickedNewline) {
          if (minProgressIdx > maxIdxIfAlreadyNewline) {
            fenceToSplit = undefined;
            breakIdx = limit;
          } else {
            breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
          }
        }
      }

      // Re-check which fence (if any) the final break index falls in.
      // NOTE(review): this unconditionally overwrites `fenceToSplit`, so the
      // `fenceToSplit = undefined` assignments above have no effect when the
      // hard break at `limit` is still inside `initialFence` — confirm that
      // is intended.
      const fenceAtBreak = findFenceSpanAt(spans, breakIdx);
      fenceToSplit =
        fenceAtBreak && fenceAtBreak.start === initialFence.start
          ? fenceAtBreak
          : undefined;
    }

    let rawChunk = remaining.slice(0, breakIdx);
    // An empty chunk means no progress is possible; stop and emit the rest
    // as a single (over-limit) chunk below.
    if (!rawChunk) break;

    // Skip a single whitespace character sitting exactly at the break.
    const brokeOnSeparator =
      breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
    const nextStart = Math.min(
      remaining.length,
      breakIdx + (brokeOnSeparator ? 1 : 0),
    );
    let next = remaining.slice(nextStart);

    if (fenceToSplit) {
      // Close the fence in this chunk and reopen it at the top of the rest.
      const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
      rawChunk = rawChunk.endsWith("\n")
        ? `${rawChunk}${closeLine}`
        : `${rawChunk}\n${closeLine}`;
      next = `${fenceToSplit.openLine}\n${next}`;
    } else {
      next = stripLeadingNewlines(next);
    }

    chunks.push(rawChunk);
    remaining = next;
  }

  if (remaining.length) chunks.push(remaining);
  return chunks;
}
|
|
|
|
/**
 * Removes every leading `"\n"` from `value`.
 *
 * Only line-feed characters are stripped; other leading whitespace
 * (spaces, tabs, `"\r"`) is left untouched.
 *
 * @param value - Text to trim.
 * @returns `value` without its leading newlines.
 */
function stripLeadingNewlines(value: string): string {
  let start = 0;
  // `charAt` returns "" past the end, which terminates the scan.
  while (value.charAt(start) === "\n") start += 1;
  return start === 0 ? value : value.slice(start);
}
|
|
|
|
/**
 * Finds the right-most index in `window` at which the text may be split
 * without `isSafeFenceBreak` rejecting the position.
 *
 * Newlines are preferred; if no newline qualifies, any whitespace character
 * is accepted. Index 0 is never returned, since breaking there would yield
 * an empty chunk.
 *
 * @param window - Leading slice of the text being chunked.
 * @param spans - Fence spans from `parseFenceSpans` for the text that
 *   `window` was sliced from.
 * @returns The chosen break index, or -1 when no position qualifies.
 */
function pickSafeBreakIndex(
  window: string,
  spans: ReturnType<typeof parseFenceSpans>,
): number {
  // Pass 1: newlines, scanning from the end of the window backwards.
  for (
    let idx = window.lastIndexOf("\n");
    idx > 0;
    idx = window.lastIndexOf("\n", idx - 1)
  ) {
    if (isSafeFenceBreak(spans, idx)) return idx;
  }

  // Pass 2: any whitespace character, again from the end backwards.
  for (let idx = window.length - 1; idx > 0; idx--) {
    if (/\s/.test(window[idx]) && isSafeFenceBreak(spans, idx)) return idx;
  }

  return -1;
}
|