fix: split long memory lines
This commit is contained in:
@@ -7,6 +7,7 @@ Docs: https://docs.clawd.bot
|
||||
### Changes
|
||||
|
||||
### Fixes
|
||||
- Memory: split overly long lines to keep embeddings under token limits.
|
||||
|
||||
## 2026.1.17-1
|
||||
|
||||
|
||||
16
src/memory/internal.test.ts
Normal file
16
src/memory/internal.test.ts
Normal file
@@ -0,0 +1,16 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { chunkMarkdown } from "./internal.js";
|
||||
|
||||
describe("chunkMarkdown", () => {
  // Regression guard: one very long line with no newlines must be broken into
  // several chunks instead of being emitted as a single oversized chunk.
  it("splits overly long lines into max-sized chunks", () => {
    const tokenBudget = 400;
    // The test assumes a budget of 4 characters per token; keep this in sync
    // with chunkMarkdown if that ratio ever changes.
    const charBudget = tokenBudget * 4;
    // Three full budgets plus a 25-character remainder, all on one line.
    const oversizedLine = "a".repeat(charBudget * 3 + 25);
    const options = { tokens: tokenBudget, overlap: 0 };

    const pieces = chunkMarkdown(oversizedLine, options);

    expect(pieces.length).toBeGreaterThan(1);
    // No individual chunk may exceed the character budget.
    for (const piece of pieces) {
      const { text } = piece;
      expect(text.length).toBeLessThanOrEqual(charBudget);
    }
  });
});
|
||||
@@ -144,13 +144,23 @@ export function chunkMarkdown(
|
||||
for (let i = 0; i < lines.length; i += 1) {
|
||||
const line = lines[i] ?? "";
|
||||
const lineNo = i + 1;
|
||||
const lineSize = line.length + 1;
|
||||
if (currentChars + lineSize > maxChars && current.length > 0) {
|
||||
flush();
|
||||
carryOverlap();
|
||||
const segments: string[] = [];
|
||||
if (line.length === 0) {
|
||||
segments.push("");
|
||||
} else {
|
||||
for (let start = 0; start < line.length; start += maxChars) {
|
||||
segments.push(line.slice(start, start + maxChars));
|
||||
}
|
||||
}
|
||||
for (const segment of segments) {
|
||||
const lineSize = segment.length + 1;
|
||||
if (currentChars + lineSize > maxChars && current.length > 0) {
|
||||
flush();
|
||||
carryOverlap();
|
||||
}
|
||||
current.push({ line: segment, lineNo });
|
||||
currentChars += lineSize;
|
||||
}
|
||||
current.push({ line, lineNo });
|
||||
currentChars += lineSize;
|
||||
}
|
||||
flush();
|
||||
return chunks;
|
||||
|
||||
Reference in New Issue
Block a user