fix: split long memory lines

This commit is contained in:
Peter Steinberger
2026-01-17 21:11:56 +00:00
parent 40345642fa
commit 4b11ebb30e
3 changed files with 33 additions and 6 deletions

View File

@@ -7,6 +7,7 @@ Docs: https://docs.clawd.bot
### Changes
### Fixes
- Memory: split overly long lines to keep embeddings under token limits.
## 2026.1.17-1

View File

@@ -0,0 +1,16 @@
import { describe, expect, it } from "vitest";
import { chunkMarkdown } from "./internal.js";
describe("chunkMarkdown", () => {
  it("splits overly long lines into max-sized chunks", () => {
    // Mirror the chunker's 4-chars-per-token sizing heuristic.
    const tokenBudget = 400;
    const charLimit = tokenBudget * 4;
    // One unbroken line slightly more than three chunks long, so the
    // chunker is forced to split it mid-line.
    const longLine = "a".repeat(charLimit * 3 + 25);
    const result = chunkMarkdown(longLine, { tokens: tokenBudget, overlap: 0 });
    expect(result.length).toBeGreaterThan(1);
    // No emitted chunk may exceed the character budget.
    for (const piece of result) {
      expect(piece.text.length).toBeLessThanOrEqual(charLimit);
    }
  });
});

View File

@@ -144,13 +144,23 @@ export function chunkMarkdown(
for (let i = 0; i < lines.length; i += 1) {
const line = lines[i] ?? "";
const lineNo = i + 1;
const lineSize = line.length + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();
const segments: string[] = [];
if (line.length === 0) {
segments.push("");
} else {
for (let start = 0; start < line.length; start += maxChars) {
segments.push(line.slice(start, start + maxChars));
}
}
for (const segment of segments) {
const lineSize = segment.length + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();
}
current.push({ line: segment, lineNo });
currentChars += lineSize;
}
current.push({ line, lineNo });
currentChars += lineSize;
}
flush();
return chunks;