Files
clawdbot/src/agents/pi-embedded-block-chunker.ts
Peter Steinberger c379191f80 chore: migrate to oxlint and oxfmt
Co-authored-by: Christoph Nakazawa <christoph.pojer@gmail.com>
2026-01-14 15:02:19 +00:00

240 lines
7.4 KiB
TypeScript

import { findFenceSpanAt, isSafeFenceBreak, parseFenceSpans } from "../markdown/fences.js";
export type BlockReplyChunking = {
minChars: number;
maxChars: number;
breakPreference?: "paragraph" | "newline" | "sentence";
};
type FenceSplit = {
closeFenceLine: string;
reopenFenceLine: string;
};
type BreakResult = {
index: number;
fenceSplit?: FenceSplit;
};
export class EmbeddedBlockChunker {
#buffer = "";
readonly #chunking: BlockReplyChunking;
constructor(chunking: BlockReplyChunking) {
this.#chunking = chunking;
}
append(text: string) {
if (!text) return;
this.#buffer += text;
}
reset() {
this.#buffer = "";
}
get bufferedText() {
return this.#buffer;
}
hasBuffered(): boolean {
return this.#buffer.length > 0;
}
drain(params: { force: boolean; emit: (chunk: string) => void }) {
// KNOWN: We cannot split inside fenced code blocks (Markdown breaks + UI glitches).
// When forced (maxChars), we close + reopen the fence to keep Markdown valid.
const { force, emit } = params;
const minChars = Math.max(1, Math.floor(this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
if (this.#buffer.length < minChars && !force) return;
if (force && this.#buffer.length <= maxChars) {
if (this.#buffer.trim().length > 0) {
emit(this.#buffer);
}
this.#buffer = "";
return;
}
while (this.#buffer.length >= minChars || (force && this.#buffer.length > 0)) {
const breakResult =
force && this.#buffer.length <= maxChars
? this.#pickSoftBreakIndex(this.#buffer, 1)
: this.#pickBreakIndex(this.#buffer, force ? 1 : undefined);
if (breakResult.index <= 0) {
if (force) {
emit(this.#buffer);
this.#buffer = "";
}
return;
}
const breakIdx = breakResult.index;
let rawChunk = this.#buffer.slice(0, breakIdx);
if (rawChunk.trim().length === 0) {
this.#buffer = stripLeadingNewlines(this.#buffer.slice(breakIdx)).trimStart();
continue;
}
let nextBuffer = this.#buffer.slice(breakIdx);
const fenceSplit = breakResult.fenceSplit;
if (fenceSplit) {
const closeFence = rawChunk.endsWith("\n")
? `${fenceSplit.closeFenceLine}\n`
: `\n${fenceSplit.closeFenceLine}\n`;
rawChunk = `${rawChunk}${closeFence}`;
const reopenFence = fenceSplit.reopenFenceLine.endsWith("\n")
? fenceSplit.reopenFenceLine
: `${fenceSplit.reopenFenceLine}\n`;
nextBuffer = `${reopenFence}${nextBuffer}`;
}
emit(rawChunk);
if (fenceSplit) {
this.#buffer = nextBuffer;
} else {
const nextStart =
breakIdx < this.#buffer.length && /\s/.test(this.#buffer[breakIdx])
? breakIdx + 1
: breakIdx;
this.#buffer = stripLeadingNewlines(this.#buffer.slice(nextStart));
}
if (this.#buffer.length < minChars && !force) return;
if (this.#buffer.length < maxChars && !force) return;
}
}
#pickSoftBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
if (buffer.length < minChars) return { index: -1 };
const fenceSpans = parseFenceSpans(buffer);
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
let paragraphIdx = buffer.indexOf("\n\n");
while (paragraphIdx !== -1) {
const candidates = [paragraphIdx, paragraphIdx + 1];
for (const candidate of candidates) {
if (candidate < minChars) continue;
if (candidate < 0 || candidate >= buffer.length) continue;
if (isSafeFenceBreak(fenceSpans, candidate)) {
return { index: candidate };
}
}
paragraphIdx = buffer.indexOf("\n\n", paragraphIdx + 2);
}
}
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = buffer.indexOf("\n");
while (newlineIdx !== -1) {
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, newlineIdx)) {
return { index: newlineIdx };
}
newlineIdx = buffer.indexOf("\n", newlineIdx + 1);
}
}
if (preference !== "newline") {
const matches = buffer.matchAll(/[.!?](?=\s|$)/g);
let sentenceIdx = -1;
for (const match of matches) {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
if (sentenceIdx >= minChars) return { index: sentenceIdx };
}
return { index: -1 };
}
#pickBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
if (buffer.length < minChars) return { index: -1 };
const window = buffer.slice(0, Math.min(maxChars, buffer.length));
const fenceSpans = parseFenceSpans(buffer);
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
let paragraphIdx = window.lastIndexOf("\n\n");
while (paragraphIdx >= minChars) {
const candidates = [paragraphIdx, paragraphIdx + 1];
for (const candidate of candidates) {
if (candidate < minChars) continue;
if (candidate < 0 || candidate >= buffer.length) continue;
if (isSafeFenceBreak(fenceSpans, candidate)) {
return { index: candidate };
}
}
paragraphIdx = window.lastIndexOf("\n\n", paragraphIdx - 1);
}
}
if (preference === "paragraph" || preference === "newline") {
let newlineIdx = window.lastIndexOf("\n");
while (newlineIdx >= minChars) {
if (isSafeFenceBreak(fenceSpans, newlineIdx)) {
return { index: newlineIdx };
}
newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
}
}
if (preference !== "newline") {
const matches = window.matchAll(/[.!?](?=\s|$)/g);
let sentenceIdx = -1;
for (const match of matches) {
const at = match.index ?? -1;
if (at < minChars) continue;
const candidate = at + 1;
if (isSafeFenceBreak(fenceSpans, candidate)) {
sentenceIdx = candidate;
}
}
if (sentenceIdx >= minChars) return { index: sentenceIdx };
}
if (preference === "newline" && buffer.length < maxChars) {
return { index: -1 };
}
for (let i = window.length - 1; i >= minChars; i--) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
return { index: i };
}
}
if (buffer.length >= maxChars) {
if (isSafeFenceBreak(fenceSpans, maxChars)) return { index: maxChars };
const fence = findFenceSpanAt(fenceSpans, maxChars);
if (fence) {
return {
index: maxChars,
fenceSplit: {
closeFenceLine: `${fence.indent}${fence.marker}`,
reopenFenceLine: fence.openLine,
},
};
}
return { index: maxChars };
}
return { index: -1 };
}
}
function stripLeadingNewlines(value: string): string {
let i = 0;
while (i < value.length && value[i] === "\n") i++;
return i > 0 ? value.slice(i) : value;
}