fix: split long memory lines

This commit is contained in:
Peter Steinberger
2026-01-17 21:11:56 +00:00
parent 40345642fa
commit 4b11ebb30e
3 changed files with 33 additions and 6 deletions

View File

@@ -7,6 +7,7 @@ Docs: https://docs.clawd.bot
### Changes
### Fixes
- Memory: split overly long lines to keep embeddings under token limits.
## 2026.1.17-1

View File

@@ -0,0 +1,16 @@
import { describe, expect, it } from "vitest";
import { chunkMarkdown } from "./internal.js";
describe("chunkMarkdown", () => {
  it("splits overly long lines into max-sized chunks", () => {
    // Mirror the chunker's 4-chars-per-token sizing heuristic.
    const tokenBudget = 400;
    const charLimit = tokenBudget * 4;
    // One unbroken line slightly more than three chunks long, so the
    // chunker is forced to split it mid-line.
    const longLine = "a".repeat(charLimit * 3 + 25);
    const result = chunkMarkdown(longLine, { tokens: tokenBudget, overlap: 0 });
    expect(result.length).toBeGreaterThan(1);
    // No emitted chunk may exceed the character budget.
    for (const piece of result) {
      expect(piece.text.length).toBeLessThanOrEqual(charLimit);
    }
  });
});

View File

@@ -144,13 +144,23 @@ export function chunkMarkdown(
for (let i = 0; i < lines.length; i += 1) {
const line = lines[i] ?? "";
const lineNo = i + 1;
const lineSize = line.length + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();
const segments: string[] = [];
if (line.length === 0) {
segments.push("");
} else {
for (let start = 0; start < line.length; start += maxChars) {
segments.push(line.slice(start, start + maxChars));
}
}
for (const segment of segments) {
const lineSize = segment.length + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();
}
current.push({ line: segment, lineNo });
currentChars += lineSize;
}
current.push({ line, lineNo });
currentChars += lineSize;
}
flush();
return chunks;