diff --git a/CHANGELOG.md b/CHANGELOG.md index 68540eb61..a337f1b0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Docs: https://docs.clawd.bot ### Changes ### Fixes +- Memory: split overly long lines to keep embeddings under token limits. ## 2026.1.17-1 diff --git a/src/memory/internal.test.ts b/src/memory/internal.test.ts new file mode 100644 index 000000000..29c698779 --- /dev/null +++ b/src/memory/internal.test.ts @@ -0,0 +1,16 @@ +import { describe, expect, it } from "vitest"; + +import { chunkMarkdown } from "./internal.js"; + +describe("chunkMarkdown", () => { + it("splits overly long lines into max-sized chunks", () => { + const chunkTokens = 400; + const maxChars = chunkTokens * 4; + const content = "a".repeat(maxChars * 3 + 25); + const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 }); + expect(chunks.length).toBeGreaterThan(1); + for (const chunk of chunks) { + expect(chunk.text.length).toBeLessThanOrEqual(maxChars); + } + }); +}); diff --git a/src/memory/internal.ts b/src/memory/internal.ts index cb3cecad6..e53c48982 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -144,13 +144,23 @@ export function chunkMarkdown( for (let i = 0; i < lines.length; i += 1) { const line = lines[i] ?? ""; const lineNo = i + 1; - const lineSize = line.length + 1; - if (currentChars + lineSize > maxChars && current.length > 0) { - flush(); - carryOverlap(); + const segments: string[] = []; + if (line.length === 0) { + segments.push(""); + } else { + for (let start = 0; start < line.length; start += maxChars) { + segments.push(line.slice(start, start + maxChars)); + } + } + for (const segment of segments) { + const lineSize = segment.length + 1; + if (currentChars + lineSize > maxChars && current.length > 0) { + flush(); + carryOverlap(); + } + current.push({ line: segment, lineNo }); + currentChars += lineSize; } - current.push({ line, lineNo }); - currentChars += lineSize; } flush(); return chunks;