feat(memory): add gemini batches + safe reindex

Co-authored-by: Gustavo Madeira Santana <gumadeiras@gmail.com>
2026-01-18 15:29:30 +00:00
parent be7191879a
commit 9464774133
5 changed files with 1472 additions and 715 deletions
--- a/src/memory/batch-gemini.ts
+++ b/src/memory/batch-gemini.ts
@@ -0,0 +1,407 @@
+import { createSubsystemLogger } from "../logging.js";
+import type { GeminiEmbeddingClient } from "./embeddings-gemini.js";
+import { hashText } from "./internal.js";
+
+/**
+ * One embedding request queued for a Gemini batch job.
+ *
+ * `custom_id` is written to the uploaded JSONL as the per-line `key` and is the
+ * identifier used to match output lines back to their request.
+ */
+export type GeminiBatchRequest = {
+  custom_id: string;
+  content: { parts: Array<{ text: string }> };
+  taskType: "RETRIEVAL_DOCUMENT" | "RETRIEVAL_QUERY";
+};
+
+/**
+ * The fields of a batch create/status response that this module reads.
+ *
+ * Everything is optional because responses are cast from JSON without validation.
+ * The output file reference is looked up in `outputConfig.file`,
+ * `outputConfig.fileId`, then `metadata.output.responsesFile`.
+ */
+export type GeminiBatchStatus = {
+  name?: string;
+  state?: string;
+  outputConfig?: { file?: string; fileId?: string };
+  metadata?: {
+    output?: {
+      responsesFile?: string;
+    };
+  };
+  error?: { message?: string };
+};
+
+/**
+ * One parsed line of the batch output JSONL.
+ *
+ * The request identifier may arrive as `key`, `custom_id` or `request_id`, and
+ * both the embedding and the error may sit at the top level or under `response`.
+ */
+export type GeminiBatchOutputLine = {
+  key?: string;
+  custom_id?: string;
+  request_id?: string;
+  embedding?: { values?: number[] };
+  response?: {
+    embedding?: { values?: number[] };
+    error?: { message?: string };
+  };
+  error?: { message?: string };
+};
+
+// Maximum number of requests placed in a single batch job; larger inputs are split.
+const GEMINI_BATCH_MAX_REQUESTS = 50000;
+// Verbose embedding logging is opt-in via CLAWDBOT_DEBUG_MEMORY_EMBEDDINGS=1.
+const debugEmbeddings = process.env.CLAWDBOT_DEBUG_MEMORY_EMBEDDINGS === "1";
+const log = createSubsystemLogger("memory/embeddings");
+
+/** Writes a raw debug line, with JSON-encoded metadata appended when given, if debug logging is on. */
+const debugLog = (message: string, meta?: Record<string, unknown>) => {
+  if (debugEmbeddings) {
+    const line = meta ? `${message} ${JSON.stringify(meta)}` : `${message}`;
+    log.raw(line);
+  }
+};
+
+/** Returns the client's base URL without a single trailing slash, or "" when none is configured. */
+function getGeminiBaseUrl(gemini: GeminiEmbeddingClient): string {
+  const configured = gemini.baseUrl;
+  if (configured == null) return "";
+  return configured.replace(/\/$/, "");
+}
+
+/**
+ * Builds request headers from a copy of the client's configured headers.
+ *
+ * HTTP header names are case-insensitive, so Content-Type is matched regardless
+ * of casing (e.g. `Content-type`, `CONTENT-TYPE`).
+ *
+ * @param gemini Client whose `headers` are copied (never mutated).
+ * @param params.json When true, ensure a JSON Content-Type is present unless the
+ *   client already supplies a non-empty one. When false, strip every
+ *   Content-Type variant so the caller can set its own (e.g. multipart upload).
+ * @returns A fresh header record.
+ */
+function getGeminiHeaders(
+  gemini: GeminiEmbeddingClient,
+  params: { json: boolean },
+): Record<string, string> {
+  const headers: Record<string, string> = gemini.headers ? { ...gemini.headers } : {};
+  const contentTypeKeys = Object.keys(headers).filter(
+    (key) => key.toLowerCase() === "content-type",
+  );
+  if (params.json) {
+    const hasContentType = contentTypeKeys.some((key) => Boolean(headers[key]));
+    if (!hasContentType) {
+      // Drop empty variants so only one Content-Type header is sent.
+      for (const key of contentTypeKeys) delete headers[key];
+      headers["Content-Type"] = "application/json";
+    }
+  } else {
+    for (const key of contentTypeKeys) delete headers[key];
+  }
+  return headers;
+}
+
+/**
+ * Derives the upload endpoint root from the API base URL.
+ *
+ * A base URL ending in `/v1beta` becomes `.../upload/v1beta`; any other base URL
+ * gets `/upload` appended. Previously a URL that merely contained `/v1beta`
+ * somewhere other than the end was returned unchanged, with no upload segment.
+ *
+ * @param baseUrl API base URL, with or without one trailing slash.
+ * @returns The upload root, without a trailing slash.
+ */
+function getGeminiUploadUrl(baseUrl: string): string {
+  const trimmed = baseUrl.replace(/\/$/, "");
+  const versionSuffix = /\/v1beta$/;
+  if (versionSuffix.test(trimmed)) {
+    return trimmed.replace(versionSuffix, "/upload/v1beta");
+  }
+  return `${trimmed}/upload`;
+}
+
+/**
+ * Partitions requests into consecutive groups of at most GEMINI_BATCH_MAX_REQUESTS.
+ * Input that already fits is returned as a single group holding the same array.
+ */
+function splitGeminiBatchRequests(requests: GeminiBatchRequest[]): GeminiBatchRequest[][] {
+  const total = requests.length;
+  if (total <= GEMINI_BATCH_MAX_REQUESTS) return [requests];
+  const groupCount = Math.ceil(total / GEMINI_BATCH_MAX_REQUESTS);
+  return Array.from({ length: groupCount }, (_, groupIndex) => {
+    const offset = groupIndex * GEMINI_BATCH_MAX_REQUESTS;
+    return requests.slice(offset, offset + GEMINI_BATCH_MAX_REQUESTS);
+  });
+}
+
+/**
+ * Assembles a two-part `multipart/related` upload: JSON file metadata followed by
+ * the JSONL payload.
+ *
+ * @param params.jsonl Newline-delimited JSON body to upload.
+ * @param params.displayName File display name; its hash also seeds the boundary.
+ * @returns The body blob plus the Content-Type header value carrying the boundary.
+ */
+function buildGeminiUploadBody(params: { jsonl: string; displayName: string }): {
+  body: Blob;
+  contentType: string;
+} {
+  const { jsonl, displayName } = params;
+  const boundary = `clawdbot-${hashText(displayName)}`;
+  const metadata = JSON.stringify({
+    file: { displayName, mimeType: "application/jsonl" },
+  });
+  const open = `--${boundary}\r\n`;
+  const payload =
+    `${open}Content-Type: application/json; charset=UTF-8\r\n\r\n${metadata}\r\n` +
+    `${open}Content-Type: application/jsonl; charset=UTF-8\r\n\r\n${jsonl}\r\n` +
+    `--${boundary}--\r\n`;
+  return {
+    body: new Blob([payload], { type: "multipart/related" }),
+    contentType: `multipart/related; boundary=${boundary}`,
+  };
+}
+
+/**
+ * Uploads one group of requests as a JSONL file, then creates a batch embedding
+ * job that references the uploaded file.
+ *
+ * @param params.gemini Client supplying the base URL, headers and `modelPath`.
+ * @param params.requests Requests serialized as one JSON object per line.
+ * @param params.agentId Used only in the batch `displayName`.
+ * @returns The batch-create response body, cast (not validated) to `GeminiBatchStatus`.
+ * @throws Error if the upload or the batch-create call is not OK, or if the
+ *   upload response carries no file name.
+ */
+async function submitGeminiBatch(params: {
+  gemini: GeminiEmbeddingClient;
+  requests: GeminiBatchRequest[];
+  agentId: string;
+}): Promise<GeminiBatchStatus> {
+  const baseUrl = getGeminiBaseUrl(params.gemini);
+  // Each JSONL line pairs the caller's custom_id (as `key`) with the embed request.
+  const jsonl = params.requests
+    .map((request) =>
+      JSON.stringify({
+        key: request.custom_id,
+        request: {
+          content: request.content,
+          task_type: request.taskType,
+        },
+      }),
+    )
+    .join("\n");
+  // The display name is derived from the current time in milliseconds.
+  const displayName = `memory-embeddings-${hashText(String(Date.now()))}`;
+  const uploadPayload = buildGeminiUploadBody({ jsonl, displayName });
+
+  const uploadUrl = `${getGeminiUploadUrl(baseUrl)}/files?uploadType=multipart`;
+  debugLog("memory embeddings: gemini batch upload", {
+    uploadUrl,
+    baseUrl,
+    requests: params.requests.length,
+  });
+  const fileRes = await fetch(uploadUrl, {
+    method: "POST",
+    headers: {
+      // Strip any configured Content-Type, then set the multipart one with its boundary.
+      ...getGeminiHeaders(params.gemini, { json: false }),
+      "Content-Type": uploadPayload.contentType,
+    },
+    body: uploadPayload.body,
+  });
+  if (!fileRes.ok) {
+    const text = await fileRes.text();
+    throw new Error(`gemini batch file upload failed: ${fileRes.status} ${text}`);
+  }
+  // The file name is accepted either at the top level or nested under `file`.
+  const filePayload = (await fileRes.json()) as { name?: string; file?: { name?: string } };
+  const fileId = filePayload.name ?? filePayload.file?.name;
+  if (!fileId) {
+    throw new Error("gemini batch file upload failed: missing file id");
+  }
+
+  const batchBody = {
+    batch: {
+      displayName: `memory-embeddings-${params.agentId}`,
+      inputConfig: {
+        file_name: fileId,
+      },
+    },
+  };
+
+  const batchEndpoint = `${baseUrl}/${params.gemini.modelPath}:asyncBatchEmbedContent`;
+  debugLog("memory embeddings: gemini batch create", {
+    batchEndpoint,
+    fileId,
+  });
+  const batchRes = await fetch(batchEndpoint, {
+    method: "POST",
+    headers: getGeminiHeaders(params.gemini, { json: true }),
+    body: JSON.stringify(batchBody),
+  });
+  if (batchRes.ok) {
+    return (await batchRes.json()) as GeminiBatchStatus;
+  }
+  const text = await batchRes.text();
+  // A 404 gets a fixed, actionable message; the response body is not included in it.
+  if (batchRes.status === 404) {
+    throw new Error(
+      "gemini batch create failed: 404 (asyncBatchEmbedContent not available for this model/baseUrl). Disable remote.batch.enabled or switch providers.",
+    );
+  }
+  throw new Error(`gemini batch create failed: ${batchRes.status} ${text}`);
+}
+
+/**
+ * Fetches the current status of a batch job.
+ *
+ * @param params.batchName Batch name; a `batches/` prefix is added when missing.
+ * @returns The response body cast to `GeminiBatchStatus`.
+ * @throws Error with the HTTP status and response text when the request is not OK.
+ */
+async function fetchGeminiBatchStatus(params: {
+  gemini: GeminiEmbeddingClient;
+  batchName: string;
+}): Promise<GeminiBatchStatus> {
+  const { gemini, batchName } = params;
+  const root = getGeminiBaseUrl(gemini);
+  const resource = batchName.startsWith("batches/") ? batchName : `batches/${batchName}`;
+  const statusUrl = `${root}/${resource}`;
+  debugLog("memory embeddings: gemini batch status", { statusUrl });
+  const response = await fetch(statusUrl, {
+    headers: getGeminiHeaders(gemini, { json: true }),
+  });
+  if (response.ok) {
+    return (await response.json()) as GeminiBatchStatus;
+  }
+  const detail = await response.text();
+  throw new Error(`gemini batch status failed: ${response.status} ${detail}`);
+}
+
+/**
+ * Downloads the text content of a file.
+ *
+ * @param params.fileId File name; a `files/` prefix is added when missing.
+ * @returns The raw response text.
+ * @throws Error with the HTTP status and response text when the request is not OK.
+ */
+async function fetchGeminiFileContent(params: {
+  gemini: GeminiEmbeddingClient;
+  fileId: string;
+}): Promise<string> {
+  const { gemini, fileId } = params;
+  const root = getGeminiBaseUrl(gemini);
+  const resource = fileId.startsWith("files/") ? fileId : `files/${fileId}`;
+  const downloadUrl = `${root}/${resource}:download`;
+  debugLog("memory embeddings: gemini batch download", { downloadUrl });
+  const response = await fetch(downloadUrl, {
+    headers: getGeminiHeaders(gemini, { json: true }),
+  });
+  const payload = await response.text();
+  if (!response.ok) {
+    throw new Error(`gemini batch file content failed: ${response.status} ${payload}`);
+  }
+  return payload;
+}
+
+/**
+ * Parses JSONL batch output into one object per non-blank line.
+ *
+ * @param text Raw file content; lines are trimmed and blank lines skipped.
+ * @returns Parsed lines in file order (empty for blank input).
+ * @throws SyntaxError from `JSON.parse` when a non-blank line is not valid JSON.
+ */
+function parseGeminiBatchOutput(text: string): GeminiBatchOutputLine[] {
+  const parsed: GeminiBatchOutputLine[] = [];
+  for (const rawLine of text.split("\n")) {
+    const candidate = rawLine.trim();
+    if (candidate) {
+      parsed.push(JSON.parse(candidate) as GeminiBatchOutputLine);
+    }
+  }
+  return parsed;
+}
+
+/**
+ * Polls a batch job until it reaches a terminal state.
+ *
+ * The first iteration uses `params.initial` when supplied; later iterations fetch
+ * a fresh status.
+ *
+ * @returns The output file reference of a successfully finished batch.
+ * @throws Error when the batch finished without an output file, ended in a
+ *   failed/cancelled/expired state, is unfinished while `wait` is false, or is
+ *   still unfinished after `timeoutMs`.
+ */
+async function waitForGeminiBatch(params: {
+  gemini: GeminiEmbeddingClient;
+  batchName: string;
+  wait: boolean;
+  pollIntervalMs: number;
+  timeoutMs: number;
+  debug?: (message: string, data?: Record<string, unknown>) => void;
+  initial?: GeminiBatchStatus;
+}): Promise<{ outputFileId: string }> {
+  const successStates = ["SUCCEEDED", "COMPLETED", "DONE"];
+  const failureStates = ["FAILED", "CANCELLED", "CANCELED", "EXPIRED"];
+  const startedAt = Date.now();
+  let pending: GeminiBatchStatus | undefined = params.initial;
+  for (;;) {
+    const status =
+      pending ??
+      (await fetchGeminiBatchStatus({ gemini: params.gemini, batchName: params.batchName }));
+    // The cached status is only valid for a single pass.
+    pending = undefined;
+    const state = status.state ?? "UNKNOWN";
+
+    if (successStates.includes(state)) {
+      const { outputConfig, metadata } = status;
+      const outputFileId =
+        outputConfig?.file ?? outputConfig?.fileId ?? metadata?.output?.responsesFile;
+      if (outputFileId) return { outputFileId };
+      throw new Error(`gemini batch ${params.batchName} completed without output file`);
+    }
+    if (failureStates.includes(state)) {
+      const reason = status.error?.message ?? "unknown error";
+      throw new Error(`gemini batch ${params.batchName} ${state}: ${reason}`);
+    }
+    if (!params.wait) {
+      throw new Error(`gemini batch ${params.batchName} still ${state}; wait disabled`);
+    }
+    const elapsedMs = Date.now() - startedAt;
+    if (elapsedMs > params.timeoutMs) {
+      throw new Error(`gemini batch ${params.batchName} timed out after ${params.timeoutMs}ms`);
+    }
+    params.debug?.(`gemini batch ${params.batchName} ${state}; waiting ${params.pollIntervalMs}ms`);
+    await new Promise((resolve) => setTimeout(resolve, params.pollIntervalMs));
+  }
+}
+
+/**
+ * Runs task factories with at most `limit` in flight and returns their results
+ * in task order.
+ *
+ * After the first failure no further tasks are started; tasks already running
+ * are allowed to settle, then the first error is rethrown.
+ *
+ * @param tasks Zero-argument factories, each started at most once.
+ * @param limit Desired concurrency, clamped to `[1, tasks.length]`; a NaN limit
+ *   is treated as 1 instead of silently running nothing.
+ * @returns Results positioned by task index.
+ * @throws Whatever the first failing task rejected with, including falsy values.
+ */
+async function runWithConcurrency<T>(tasks: Array<() => Promise<T>>, limit: number): Promise<T[]> {
+  if (tasks.length === 0) return [];
+  // Math.min/Math.max propagate NaN, which would yield zero workers below.
+  const requested = Number.isNaN(limit) ? 1 : limit;
+  const resolvedLimit = Math.floor(Math.max(1, Math.min(requested, tasks.length)));
+  const results: T[] = Array.from({ length: tasks.length });
+  let next = 0;
+  // Failure is tracked separately from the error value so that a rejection with
+  // a falsy value (undefined, null, 0, "") is not mistaken for success.
+  let failed = false;
+  let firstError: unknown;
+
+  const workers = Array.from({ length: resolvedLimit }, async () => {
+    while (!failed) {
+      const index = next;
+      next += 1;
+      if (index >= tasks.length) return;
+      try {
+        results[index] = await tasks[index]();
+      } catch (err) {
+        if (!failed) {
+          failed = true;
+          firstError = err;
+        }
+        return;
+      }
+    }
+  });
+
+  await Promise.allSettled(workers);
+  if (failed) throw firstError;
+  return results;
+}
+
+/**
+ * Embeds `requests` through Gemini batch jobs and returns the vectors keyed by
+ * each request's `custom_id`.
+ *
+ * Requests are split into groups; each group is submitted, awaited, downloaded
+ * and parsed as one task, and tasks run with up to `concurrency` in flight.
+ *
+ * @param params.wait When false, a batch that is not already finished causes an error.
+ * @param params.pollIntervalMs Delay between status polls.
+ * @param params.timeoutMs Maximum time to wait for a single batch.
+ * @param params.debug Optional sink for progress messages.
+ * @returns Map from `custom_id` to embedding values (empty map for no requests).
+ * @throws Error if any group fails to submit or complete, reports per-request
+ *   errors or empty embeddings, or omits responses for some requests.
+ */
+export async function runGeminiEmbeddingBatches(params: {
+  gemini: GeminiEmbeddingClient;
+  agentId: string;
+  requests: GeminiBatchRequest[];
+  wait: boolean;
+  pollIntervalMs: number;
+  timeoutMs: number;
+  concurrency: number;
+  debug?: (message: string, data?: Record<string, unknown>) => void;
+}): Promise<Map<string, number[]>> {
+  if (params.requests.length === 0) return new Map();
+  const groups = splitGeminiBatchRequests(params.requests);
+  // Shared across all group tasks; each task adds the embeddings it parsed.
+  const byCustomId = new Map<string, number[]>();
+
+  const tasks = groups.map((group, groupIndex) => async () => {
+    const batchInfo = await submitGeminiBatch({
+      gemini: params.gemini,
+      requests: group,
+      agentId: params.agentId,
+    });
+    const batchName = batchInfo.name ?? "";
+    if (!batchName) {
+      throw new Error("gemini batch create failed: missing batch name");
+    }
+
+    params.debug?.("memory embeddings: gemini batch created", {
+      batchName,
+      state: batchInfo.state,
+      group: groupIndex + 1,
+      groups: groups.length,
+      requests: group.length,
+    });
+
+    // Without waiting there is nothing to return unless the batch finished immediately.
+    if (!params.wait && batchInfo.state && !["SUCCEEDED", "COMPLETED", "DONE"].includes(batchInfo.state)) {
+      throw new Error(
+        `gemini batch ${batchName} submitted; enable remote.batch.wait to await completion`,
+      );
+    }
+
+    // Use the create response directly when it is already terminal; otherwise poll.
+    const completed =
+      batchInfo.state && ["SUCCEEDED", "COMPLETED", "DONE"].includes(batchInfo.state)
+        ? {
+            outputFileId:
+              batchInfo.outputConfig?.file ??
+              batchInfo.outputConfig?.fileId ??
+              batchInfo.metadata?.output?.responsesFile ??
+              "",
+          }
+        : await waitForGeminiBatch({
+            gemini: params.gemini,
+            batchName,
+            wait: params.wait,
+            pollIntervalMs: params.pollIntervalMs,
+            timeoutMs: params.timeoutMs,
+            debug: params.debug,
+            initial: batchInfo,
+          });
+    if (!completed.outputFileId) {
+      throw new Error(`gemini batch ${batchName} completed without output file`);
+    }
+
+    const content = await fetchGeminiFileContent({
+      gemini: params.gemini,
+      fileId: completed.outputFileId,
+    });
+    const outputLines = parseGeminiBatchOutput(content);
+    const errors: string[] = [];
+    // Tracks which requests of this group have not yet appeared in the output.
+    const remaining = new Set(group.map((request) => request.custom_id));
+
+    for (const line of outputLines) {
+      const customId = line.key ?? line.custom_id ?? line.request_id;
+      // Lines without an identifier cannot be matched to a request and are skipped.
+      if (!customId) continue;
+      remaining.delete(customId);
+      if (line.error?.message) {
+        errors.push(`${customId}: ${line.error.message}`);
+        continue;
+      }
+      if (line.response?.error?.message) {
+        errors.push(`${customId}: ${line.response.error.message}`);
+        continue;
+      }
+      const embedding =
+        line.embedding?.values ?? line.response?.embedding?.values ?? [];
+      if (embedding.length === 0) {
+        errors.push(`${customId}: empty embedding`);
+        continue;
+      }
+      byCustomId.set(customId, embedding);
+    }
+
+    // Any per-request error or missing response fails the whole group.
+    if (errors.length > 0) {
+      throw new Error(`gemini batch ${batchName} failed: ${errors.join("; ")}`);
+    }
+    if (remaining.size > 0) {
+      throw new Error(`gemini batch ${batchName} missing ${remaining.size} embedding responses`);
+    }
+  });
+
+  params.debug?.("memory embeddings: gemini batch submit", {
+    requests: params.requests.length,
+    groups: groups.length,
+    wait: params.wait,
+    concurrency: params.concurrency,
+    pollIntervalMs: params.pollIntervalMs,
+    timeoutMs: params.timeoutMs,
+  });
+
+  await runWithConcurrency(tasks, params.concurrency);
+  return byCustomId;
+}
--- a/src/memory/batch-openai.ts
+++ b/src/memory/batch-openai.ts
@@ -0,0 +1,362 @@
+import type { OpenAiEmbeddingClient } from "./embeddings-openai.js";
+import { hashText } from "./internal.js";
+
+/**
+ * One line of the JSONL input file uploaded for an OpenAI batch job.
+ *
+ * `custom_id` is the identifier used to match output lines back to their request.
+ */
+export type OpenAiBatchRequest = {
+  custom_id: string;
+  method: "POST";
+  url: "/v1/embeddings";
+  body: {
+    model: string;
+    input: string;
+  };
+};
+
+/**
+ * The fields of a batch create/status response that this module reads.
+ * Everything is optional because responses are cast from JSON without validation.
+ */
+export type OpenAiBatchStatus = {
+  id?: string;
+  status?: string;
+  output_file_id?: string | null;
+  error_file_id?: string | null;
+};
+
+/**
+ * One parsed line of a batch output or error file.
+ *
+ * A failure may be reported at the top level (`error`) or inside the response
+ * body (`response.body.error`); the embedding is read from `response.body.data[0]`.
+ */
+export type OpenAiBatchOutputLine = {
+  custom_id?: string;
+  response?: {
+    status_code?: number;
+    body?: {
+      data?: Array<{ embedding?: number[]; index?: number }>;
+      error?: { message?: string };
+    };
+  };
+  error?: { message?: string };
+};
+
+// Endpoint value sent as `endpoint` when creating a batch.
+export const OPENAI_BATCH_ENDPOINT = "/v1/embeddings";
+// Value sent as `completion_window` when creating a batch.
+const OPENAI_BATCH_COMPLETION_WINDOW = "24h";
+// Maximum number of requests placed in a single batch job; larger inputs are split.
+const OPENAI_BATCH_MAX_REQUESTS = 50000;
+
+/** Returns the client's base URL without a single trailing slash, or "" when none is configured. */
+function getOpenAiBaseUrl(openAi: OpenAiEmbeddingClient): string {
+  const configured = openAi.baseUrl;
+  if (configured == null) return "";
+  return configured.replace(/\/$/, "");
+}
+
+/**
+ * Builds request headers from a copy of the client's configured headers.
+ *
+ * HTTP header names are case-insensitive, so Content-Type is matched regardless
+ * of casing (e.g. `Content-type`, `CONTENT-TYPE`).
+ *
+ * @param openAi Client whose `headers` are copied (never mutated).
+ * @param params.json When true, ensure a JSON Content-Type is present unless the
+ *   client already supplies a non-empty one. When false, strip every
+ *   Content-Type variant so `fetch` can set the multipart boundary for form uploads.
+ * @returns A fresh header record.
+ */
+function getOpenAiHeaders(
+  openAi: OpenAiEmbeddingClient,
+  params: { json: boolean },
+): Record<string, string> {
+  const headers: Record<string, string> = openAi.headers ? { ...openAi.headers } : {};
+  const contentTypeKeys = Object.keys(headers).filter(
+    (key) => key.toLowerCase() === "content-type",
+  );
+  if (params.json) {
+    const hasContentType = contentTypeKeys.some((key) => Boolean(headers[key]));
+    if (!hasContentType) {
+      // Drop empty variants so only one Content-Type header is sent.
+      for (const key of contentTypeKeys) delete headers[key];
+      headers["Content-Type"] = "application/json";
+    }
+  } else {
+    for (const key of contentTypeKeys) delete headers[key];
+  }
+  return headers;
+}
+
+/**
+ * Partitions requests into consecutive groups of at most OPENAI_BATCH_MAX_REQUESTS.
+ * Input that already fits is returned as a single group holding the same array.
+ */
+function splitOpenAiBatchRequests(requests: OpenAiBatchRequest[]): OpenAiBatchRequest[][] {
+  const total = requests.length;
+  if (total <= OPENAI_BATCH_MAX_REQUESTS) return [requests];
+  const chunks: OpenAiBatchRequest[][] = [];
+  let offset = 0;
+  while (offset < total) {
+    const end = offset + OPENAI_BATCH_MAX_REQUESTS;
+    chunks.push(requests.slice(offset, end));
+    offset = end;
+  }
+  return chunks;
+}
+
+/**
+ * Uploads one group of requests as a JSONL file, then creates a batch job that
+ * references the uploaded file.
+ *
+ * @param params.openAi Client supplying the base URL and headers.
+ * @param params.requests Requests serialized as one JSON object per line.
+ * @param params.agentId Recorded in the batch metadata as `agent`.
+ * @returns The batch-create response body, cast (not validated) to `OpenAiBatchStatus`.
+ * @throws Error if the upload or the batch-create call is not OK, or if the
+ *   upload response carries no file id.
+ */
+async function submitOpenAiBatch(params: {
+  openAi: OpenAiEmbeddingClient;
+  requests: OpenAiBatchRequest[];
+  agentId: string;
+}): Promise<OpenAiBatchStatus> {
+  const baseUrl = getOpenAiBaseUrl(params.openAi);
+  const jsonl = params.requests.map((request) => JSON.stringify(request)).join("\n");
+  const form = new FormData();
+  form.append("purpose", "batch");
+  // The uploaded file name embeds a hash of the current time in milliseconds.
+  form.append(
+    "file",
+    new Blob([jsonl], { type: "application/jsonl" }),
+    `memory-embeddings.${hashText(String(Date.now()))}.jsonl`,
+  );
+
+  const fileRes = await fetch(`${baseUrl}/files`, {
+    method: "POST",
+    // json: false strips any configured Content-Type before sending the form body.
+    headers: getOpenAiHeaders(params.openAi, { json: false }),
+    body: form,
+  });
+  if (!fileRes.ok) {
+    const text = await fileRes.text();
+    throw new Error(`openai batch file upload failed: ${fileRes.status} ${text}`);
+  }
+  const filePayload = (await fileRes.json()) as { id?: string };
+  if (!filePayload.id) {
+    throw new Error("openai batch file upload failed: missing file id");
+  }
+
+  const batchRes = await fetch(`${baseUrl}/batches`, {
+    method: "POST",
+    headers: getOpenAiHeaders(params.openAi, { json: true }),
+    body: JSON.stringify({
+      input_file_id: filePayload.id,
+      endpoint: OPENAI_BATCH_ENDPOINT,
+      completion_window: OPENAI_BATCH_COMPLETION_WINDOW,
+      metadata: {
+        source: "clawdbot-memory",
+        agent: params.agentId,
+      },
+    }),
+  });
+  if (!batchRes.ok) {
+    const text = await batchRes.text();
+    throw new Error(`openai batch create failed: ${batchRes.status} ${text}`);
+  }
+  return (await batchRes.json()) as OpenAiBatchStatus;
+}
+
+/**
+ * Fetches the current status of a batch job.
+ *
+ * @returns The response body cast to `OpenAiBatchStatus`.
+ * @throws Error with the HTTP status and response text when the request is not OK.
+ */
+async function fetchOpenAiBatchStatus(params: {
+  openAi: OpenAiEmbeddingClient;
+  batchId: string;
+}): Promise<OpenAiBatchStatus> {
+  const { openAi, batchId } = params;
+  const statusUrl = `${getOpenAiBaseUrl(openAi)}/batches/${batchId}`;
+  const response = await fetch(statusUrl, {
+    headers: getOpenAiHeaders(openAi, { json: true }),
+  });
+  if (response.ok) {
+    return (await response.json()) as OpenAiBatchStatus;
+  }
+  const detail = await response.text();
+  throw new Error(`openai batch status failed: ${response.status} ${detail}`);
+}
+
+/**
+ * Downloads the text content of an uploaded or generated file.
+ *
+ * @returns The raw response text.
+ * @throws Error with the HTTP status and response text when the request is not OK.
+ */
+async function fetchOpenAiFileContent(params: {
+  openAi: OpenAiEmbeddingClient;
+  fileId: string;
+}): Promise<string> {
+  const { openAi, fileId } = params;
+  const contentUrl = `${getOpenAiBaseUrl(openAi)}/files/${fileId}/content`;
+  const response = await fetch(contentUrl, {
+    headers: getOpenAiHeaders(openAi, { json: true }),
+  });
+  const payload = await response.text();
+  if (!response.ok) {
+    throw new Error(`openai batch file content failed: ${response.status} ${payload}`);
+  }
+  return payload;
+}
+
+/**
+ * Parses JSONL batch output into one object per non-blank line.
+ *
+ * @param text Raw file content; lines are trimmed and blank lines skipped.
+ * @returns Parsed lines in file order (empty for blank input).
+ * @throws SyntaxError from `JSON.parse` when a non-blank line is not valid JSON.
+ */
+function parseOpenAiBatchOutput(text: string): OpenAiBatchOutputLine[] {
+  return text.split("\n").reduce<OpenAiBatchOutputLine[]>((parsed, rawLine) => {
+    const candidate = rawLine.trim();
+    if (candidate) {
+      parsed.push(JSON.parse(candidate) as OpenAiBatchOutputLine);
+    }
+    return parsed;
+  }, []);
+}
+
+/**
+ * Best-effort extraction of the first error message from a batch error file.
+ *
+ * @returns The first message found, `undefined` when no line carries one, or an
+ *   `error file unavailable: …` string when the file cannot be read or parsed.
+ *   Never throws.
+ */
+async function readOpenAiBatchError(params: {
+  openAi: OpenAiEmbeddingClient;
+  errorFileId: string;
+}): Promise<string | undefined> {
+  try {
+    const raw = await fetchOpenAiFileContent({
+      openAi: params.openAi,
+      fileId: params.errorFileId,
+    });
+    for (const line of parseOpenAiBatchOutput(raw)) {
+      const topLevel = line.error?.message;
+      // Only the first line that reports any error is considered.
+      if (topLevel || line.response?.body?.error) {
+        const nested = line.response?.body?.error?.message;
+        return topLevel ?? (typeof nested === "string" ? nested : undefined);
+      }
+    }
+    return undefined;
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    if (!reason) return undefined;
+    return `error file unavailable: ${reason}`;
+  }
+}
+
+/**
+ * Polls a batch job until it reaches a terminal state.
+ *
+ * The first iteration uses `params.initial` when supplied; later iterations fetch
+ * a fresh status.
+ *
+ * @returns The output file id (and error file id, if any) of a completed batch.
+ * @throws Error when the batch completed without an output file, ended as
+ *   failed/expired/cancelled, is unfinished while `wait` is false, or is still
+ *   unfinished after `timeoutMs`. When an error file is available its first
+ *   message is appended to the thrown error.
+ */
+async function waitForOpenAiBatch(params: {
+  openAi: OpenAiEmbeddingClient;
+  batchId: string;
+  wait: boolean;
+  pollIntervalMs: number;
+  timeoutMs: number;
+  debug?: (message: string, data?: Record<string, unknown>) => void;
+  initial?: OpenAiBatchStatus;
+}): Promise<{ outputFileId: string; errorFileId?: string }> {
+  // Reads the error file (best effort) and formats it as a message suffix.
+  const describeErrorFile = async (errorFileId?: string | null): Promise<string> => {
+    if (!errorFileId) return "";
+    const detail = await readOpenAiBatchError({ openAi: params.openAi, errorFileId });
+    return detail ? `: ${detail}` : "";
+  };
+
+  const start = Date.now();
+  let current: OpenAiBatchStatus | undefined = params.initial;
+  while (true) {
+    const status =
+      current ??
+      (await fetchOpenAiBatchStatus({
+        openAi: params.openAi,
+        batchId: params.batchId,
+      }));
+    const state = status.status ?? "unknown";
+    if (state === "completed") {
+      if (!status.output_file_id) {
+        // A completed batch may have only an error file; surface its first
+        // message instead of dropping the reason.
+        const suffix = await describeErrorFile(status.error_file_id);
+        throw new Error(`openai batch ${params.batchId} completed without output file${suffix}`);
+      }
+      return {
+        outputFileId: status.output_file_id,
+        errorFileId: status.error_file_id ?? undefined,
+      };
+    }
+    if (["failed", "expired", "cancelled", "canceled"].includes(state)) {
+      const suffix = await describeErrorFile(status.error_file_id);
+      throw new Error(`openai batch ${params.batchId} ${state}${suffix}`);
+    }
+    if (!params.wait) {
+      throw new Error(`openai batch ${params.batchId} still ${state}; wait disabled`);
+    }
+    if (Date.now() - start > params.timeoutMs) {
+      throw new Error(`openai batch ${params.batchId} timed out after ${params.timeoutMs}ms`);
+    }
+    params.debug?.(`openai batch ${params.batchId} ${state}; waiting ${params.pollIntervalMs}ms`);
+    await new Promise((resolve) => setTimeout(resolve, params.pollIntervalMs));
+    // Force a fresh status fetch on the next pass.
+    current = undefined;
+  }
+}
+
+/**
+ * Runs task factories with at most `limit` in flight and returns their results
+ * in task order.
+ *
+ * After the first failure no further tasks are started; tasks already running
+ * are allowed to settle, then the first error is rethrown.
+ *
+ * @param tasks Zero-argument factories, each started at most once.
+ * @param limit Desired concurrency, clamped to `[1, tasks.length]`; a NaN limit
+ *   is treated as 1 instead of silently running nothing.
+ * @returns Results positioned by task index.
+ * @throws Whatever the first failing task rejected with, including falsy values.
+ */
+async function runWithConcurrency<T>(tasks: Array<() => Promise<T>>, limit: number): Promise<T[]> {
+  if (tasks.length === 0) return [];
+  // Math.min/Math.max propagate NaN, which would yield zero workers below.
+  const requested = Number.isNaN(limit) ? 1 : limit;
+  const resolvedLimit = Math.floor(Math.max(1, Math.min(requested, tasks.length)));
+  const results: T[] = Array.from({ length: tasks.length });
+  let next = 0;
+  // Failure is tracked separately from the error value so that a rejection with
+  // a falsy value (undefined, null, 0, "") is not mistaken for success.
+  let failed = false;
+  let firstError: unknown;
+
+  const workers = Array.from({ length: resolvedLimit }, async () => {
+    while (!failed) {
+      const index = next;
+      next += 1;
+      if (index >= tasks.length) return;
+      try {
+        results[index] = await tasks[index]();
+      } catch (err) {
+        if (!failed) {
+          failed = true;
+          firstError = err;
+        }
+        return;
+      }
+    }
+  });
+
+  await Promise.allSettled(workers);
+  if (failed) throw firstError;
+  return results;
+}
+
+/**
+ * Embeds `requests` through OpenAI batch jobs and returns the vectors keyed by
+ * each request's `custom_id`.
+ *
+ * Requests are split into groups; each group is submitted, awaited, downloaded
+ * and parsed as one task, and tasks run with up to `concurrency` in flight.
+ *
+ * @param params.wait When false, a batch that is not already completed causes an error.
+ * @param params.pollIntervalMs Delay between status polls.
+ * @param params.timeoutMs Maximum time to wait for a single batch.
+ * @param params.debug Optional sink for progress messages.
+ * @returns Map from `custom_id` to embedding values (empty map for no requests).
+ * @throws Error if any group fails to submit or complete, reports per-request
+ *   errors or empty embeddings, or omits responses for some requests.
+ */
+export async function runOpenAiEmbeddingBatches(params: {
+  openAi: OpenAiEmbeddingClient;
+  agentId: string;
+  requests: OpenAiBatchRequest[];
+  wait: boolean;
+  pollIntervalMs: number;
+  timeoutMs: number;
+  concurrency: number;
+  debug?: (message: string, data?: Record<string, unknown>) => void;
+}): Promise<Map<string, number[]>> {
+  if (params.requests.length === 0) return new Map();
+  const groups = splitOpenAiBatchRequests(params.requests);
+  // Shared across all group tasks; each task adds the embeddings it parsed.
+  const byCustomId = new Map<string, number[]>();
+
+  const tasks = groups.map((group, groupIndex) => async () => {
+    const batchInfo = await submitOpenAiBatch({
+      openAi: params.openAi,
+      requests: group,
+      agentId: params.agentId,
+    });
+    if (!batchInfo.id) {
+      throw new Error("openai batch create failed: missing batch id");
+    }
+
+    params.debug?.("memory embeddings: openai batch created", {
+      batchId: batchInfo.id,
+      status: batchInfo.status,
+      group: groupIndex + 1,
+      groups: groups.length,
+      requests: group.length,
+    });
+
+    // Without waiting there is nothing to return unless the batch finished immediately.
+    if (!params.wait && batchInfo.status !== "completed") {
+      throw new Error(
+        `openai batch ${batchInfo.id} submitted; enable remote.batch.wait to await completion`,
+      );
+    }
+
+    // Use the create response directly when it is already completed; otherwise poll.
+    const completed =
+      batchInfo.status === "completed"
+        ? {
+            outputFileId: batchInfo.output_file_id ?? "",
+            errorFileId: batchInfo.error_file_id ?? undefined,
+          }
+        : await waitForOpenAiBatch({
+            openAi: params.openAi,
+            batchId: batchInfo.id,
+            wait: params.wait,
+            pollIntervalMs: params.pollIntervalMs,
+            timeoutMs: params.timeoutMs,
+            debug: params.debug,
+            initial: batchInfo,
+          });
+    if (!completed.outputFileId) {
+      throw new Error(`openai batch ${batchInfo.id} completed without output file`);
+    }
+
+    const content = await fetchOpenAiFileContent({
+      openAi: params.openAi,
+      fileId: completed.outputFileId,
+    });
+    const outputLines = parseOpenAiBatchOutput(content);
+    const errors: string[] = [];
+    // Tracks which requests of this group have not yet appeared in the output.
+    const remaining = new Set(group.map((request) => request.custom_id));
+
+    for (const line of outputLines) {
+      const customId = line.custom_id;
+      // Lines without an identifier cannot be matched to a request and are skipped.
+      if (!customId) continue;
+      remaining.delete(customId);
+      if (line.error?.message) {
+        errors.push(`${customId}: ${line.error.message}`);
+        continue;
+      }
+      const response = line.response;
+      // A missing status code is treated as 0, i.e. not an HTTP error.
+      const statusCode = response?.status_code ?? 0;
+      if (statusCode >= 400) {
+        const message =
+          response?.body?.error?.message ??
+          (typeof response?.body === "string" ? response.body : undefined) ??
+          "unknown error";
+        errors.push(`${customId}: ${message}`);
+        continue;
+      }
+      const data = response?.body?.data ?? [];
+      // Each request carries a single input, so only the first data entry is read.
+      const embedding = data[0]?.embedding ?? [];
+      if (embedding.length === 0) {
+        errors.push(`${customId}: empty embedding`);
+        continue;
+      }
+      byCustomId.set(customId, embedding);
+    }
+
+    // Any per-request error or missing response fails the whole group.
+    if (errors.length > 0) {
+      throw new Error(`openai batch ${batchInfo.id} failed: ${errors.join("; ")}`);
+    }
+    if (remaining.size > 0) {
+      throw new Error(`openai batch ${batchInfo.id} missing ${remaining.size} embedding responses`);
+    }
+  });
+
+  params.debug?.("memory embeddings: openai batch submit", {
+    requests: params.requests.length,
+    groups: groups.length,
+    wait: params.wait,
+    concurrency: params.concurrency,
+    pollIntervalMs: params.pollIntervalMs,
+    timeoutMs: params.timeoutMs,
+  });
+
+  await runWithConcurrency(tasks, params.concurrency);
+  return byCustomId;
+}
--- a/src/memory/index.test.ts
+++ b/src/memory/index.test.ts
@@ -7,6 +7,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";

 let embedBatchCalls = 0;
+let failEmbeddings = false;

 vi.mock("./embeddings.js", () => {
  const embedText = (text: string) => {
@@ -24,6 +25,9 @@ vi.mock("./embeddings.js", () => {
        embedQuery: async (text: string) => embedText(text),
        embedBatch: async (texts: string[]) => {
          embedBatchCalls += 1;
+          if (failEmbeddings) {
+            throw new Error("mock embeddings failed");
+          }
          return texts.map(embedText);
        },
      },
@@ -38,6 +42,7 @@ describe("memory index", () => {

  beforeEach(async () => {
    embedBatchCalls = 0;
+    failEmbeddings = false;
    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-mem-"));
    indexPath = path.join(workspaceDir, "index.sqlite");
    await fs.mkdir(path.join(workspaceDir, "memory"));
@@ -181,6 +186,43 @@ describe("memory index", () => {
    expect(embedBatchCalls).toBe(afterFirst);
  });

+  // Guards the safe-reindex behavior: a forced sync whose embedding call throws
+  // must leave the earlier index counts unchanged and leave no temp files behind.
+  it("preserves existing index when forced reindex fails", async () => {
+    const cfg = {
+      agents: {
+        defaults: {
+          workspace: workspaceDir,
+          memorySearch: {
+            provider: "openai",
+            model: "mock-embed",
+            store: { path: indexPath, vector: { enabled: false } },
+            sync: { watch: false, onSessionStart: false, onSearch: false },
+            query: { minScore: 0 },
+            cache: { enabled: false },
+          },
+        },
+        list: [{ id: "main", default: true }],
+      },
+    };
+    const result = await getMemorySearchManager({ cfg, agentId: "main" });
+    expect(result.manager).not.toBeNull();
+    if (!result.manager) throw new Error("manager missing");
+    manager = result.manager;
+
+    // Build a healthy index first so there is something to preserve.
+    await manager.sync({ force: true });
+    const before = manager.status();
+    expect(before.files).toBeGreaterThan(0);
+
+    // Make the mocked embedBatch throw, then force a full reindex.
+    failEmbeddings = true;
+    await expect(manager.sync({ force: true })).rejects.toThrow(/mock embeddings failed/i);
+
+    // File and chunk counts must match the pre-failure snapshot.
+    const after = manager.status();
+    expect(after.files).toBe(before.files);
+    expect(after.chunks).toBe(before.chunks);
+
+    // No entry containing ".tmp-" may remain in the workspace directory.
+    const files = await fs.readdir(workspaceDir);
+    expect(files.some((name) => name.includes(".tmp-"))).toBe(false);
+  });
+
  it("finds keyword matches via hybrid search when query embedding is zero", async () => {
    const cfg = {
      agents: {
--- a/src/memory/manager.ts
+++ b/src/memory/manager.ts
--- a/src/memory/openai-batch.ts
+++ b/src/memory/openai-batch.ts
@@ -1,435 +1,2 @@
-import { retryAsync } from "../infra/retry.js";
-import type { OpenAiEmbeddingClient } from "./embeddings.js";
-import { hashText } from "./internal.js";
-
-export type OpenAiBatchRequest = {
-  custom_id: string;
-  method: "POST";
-  url: "/v1/embeddings";
-  body: {
-    model: string;
-    input: string;
-  };
-};
-
-export type OpenAiBatchStatus = {
-  id?: string;
-  status?: string;
-  output_file_id?: string | null;
-  error_file_id?: string | null;
-};
-
-export type OpenAiBatchOutputLine = {
-  custom_id?: string;
-  response?: {
-    status_code?: number;
-    body?: {
-      data?: Array<{ embedding?: number[]; index?: number }>;
-      error?: { message?: string };
-    };
-  };
-  error?: { message?: string };
-};
-
-export const OPENAI_BATCH_ENDPOINT = "/v1/embeddings";
-const OPENAI_BATCH_COMPLETION_WINDOW = "24h";
-const OPENAI_BATCH_MAX_REQUESTS = 50000;
-const OPENAI_BATCH_RETRY = {
-  attempts: 3,
-  minDelayMs: 500,
-  maxDelayMs: 5000,
-  jitter: 0.1,
-};
-
-type RetryableError = Error & { status?: number };
-
-function isRetryableBatchError(err: unknown): boolean {
-  const status =
-    typeof (err as RetryableError)?.status === "number" ? (err as RetryableError).status : undefined;
-  if (typeof status === "number") {
-    return status === 429 || status >= 500;
-  }
-  const message = err instanceof Error ? err.message : String(err);
-  return /timeout|timed out|ECONNRESET|ECONNREFUSED|EHOSTUNREACH|ENOTFOUND|EAI_AGAIN|network|fetch failed|upstream connect/i.test(
-    message,
-  );
-}
-
-function formatRetryError(err: unknown): string {
-  return err instanceof Error ? err.message : String(err);
-}
-
-function getOpenAiBaseUrl(openAi: OpenAiEmbeddingClient): string {
-  return openAi.baseUrl?.replace(/\/$/, "") ?? "";
-}
-
-function getOpenAiHeaders(
-  openAi: OpenAiEmbeddingClient,
-  params: { json: boolean },
-): Record<string, string> {
-  const headers = openAi.headers ? { ...openAi.headers } : {};
-  if (params.json) {
-    if (!headers["Content-Type"] && !headers["content-type"]) {
-      headers["Content-Type"] = "application/json";
-    }
-  } else {
-    delete headers["Content-Type"];
-    delete headers["content-type"];
-  }
-  return headers;
-}
-
-function splitOpenAiBatchRequests(requests: OpenAiBatchRequest[]): OpenAiBatchRequest[][] {
-  if (requests.length <= OPENAI_BATCH_MAX_REQUESTS) return [requests];
-  const groups: OpenAiBatchRequest[][] = [];
-  for (let i = 0; i < requests.length; i += OPENAI_BATCH_MAX_REQUESTS) {
-    groups.push(requests.slice(i, i + OPENAI_BATCH_MAX_REQUESTS));
-  }
-  return groups;
-}
-
-async function fetchOpenAiWithRetry(params: {
-  openAi: OpenAiEmbeddingClient;
-  url: string;
-  init?: RequestInit;
-  label: string;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<Response> {
-  return await retryAsync(
-    async () => {
-      const res = await fetch(params.url, params.init);
-      if (!res.ok) {
-        const text = await res.text();
-        const err = new Error(`openai batch ${params.label} failed: ${res.status} ${text}`);
-        (err as RetryableError).status = res.status;
-        throw err;
-      }
-      return res;
-    },
-    {
-      ...OPENAI_BATCH_RETRY,
-      label: params.label,
-      shouldRetry: isRetryableBatchError,
-      onRetry: (info) => {
-        params.debug?.(
-          `openai batch ${params.label} retry ${info.attempt}/${info.maxAttempts} in ${info.delayMs}ms`,
-          { error: formatRetryError(info.err) },
-        );
-      },
-    },
-  );
-}
-
-async function submitOpenAiBatch(params: {
-  openAi: OpenAiEmbeddingClient;
-  requests: OpenAiBatchRequest[];
-  agentId: string;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<OpenAiBatchStatus> {
-  const baseUrl = getOpenAiBaseUrl(params.openAi);
-  const jsonl = params.requests.map((request) => JSON.stringify(request)).join("\n");
-  const form = new FormData();
-  form.append("purpose", "batch");
-  form.append(
-    "file",
-    new Blob([jsonl], { type: "application/jsonl" }),
-    `memory-embeddings.${hashText(String(Date.now()))}.jsonl`,
-  );
-
-  const fileRes = await fetchOpenAiWithRetry({
-    openAi: params.openAi,
-    url: `${baseUrl}/files`,
-    init: {
-      method: "POST",
-      headers: getOpenAiHeaders(params.openAi, { json: false }),
-      body: form,
-    },
-    label: "file upload",
-    debug: params.debug,
-  });
-  const filePayload = (await fileRes.json()) as { id?: string };
-  if (!filePayload.id) {
-    throw new Error("openai batch file upload failed: missing file id");
-  }
-
-  const batchRes = await fetchOpenAiWithRetry({
-    openAi: params.openAi,
-    url: `${baseUrl}/batches`,
-    init: {
-      method: "POST",
-      headers: getOpenAiHeaders(params.openAi, { json: true }),
-      body: JSON.stringify({
-        input_file_id: filePayload.id,
-        endpoint: OPENAI_BATCH_ENDPOINT,
-        completion_window: OPENAI_BATCH_COMPLETION_WINDOW,
-        metadata: {
-          source: "clawdbot-memory",
-          agent: params.agentId,
-        },
-      }),
-    },
-    label: "create",
-    debug: params.debug,
-  });
-  return (await batchRes.json()) as OpenAiBatchStatus;
-}
-
-async function fetchOpenAiBatchStatus(params: {
-  openAi: OpenAiEmbeddingClient;
-  batchId: string;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<OpenAiBatchStatus> {
-  const baseUrl = getOpenAiBaseUrl(params.openAi);
-  const res = await fetchOpenAiWithRetry({
-    openAi: params.openAi,
-    url: `${baseUrl}/batches/${params.batchId}`,
-    init: { headers: getOpenAiHeaders(params.openAi, { json: true }) },
-    label: "status",
-    debug: params.debug,
-  });
-  return (await res.json()) as OpenAiBatchStatus;
-}
-
-async function fetchOpenAiFileContent(params: {
-  openAi: OpenAiEmbeddingClient;
-  fileId: string;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<string> {
-  const baseUrl = getOpenAiBaseUrl(params.openAi);
-  const res = await fetchOpenAiWithRetry({
-    openAi: params.openAi,
-    url: `${baseUrl}/files/${params.fileId}/content`,
-    init: { headers: getOpenAiHeaders(params.openAi, { json: true }) },
-    label: "file content",
-    debug: params.debug,
-  });
-  return await res.text();
-}
-
-function parseOpenAiBatchOutput(text: string): OpenAiBatchOutputLine[] {
-  if (!text.trim()) return [];
-  return text
-    .split("\n")
-    .map((line) => line.trim())
-    .filter(Boolean)
-    .map((line) => JSON.parse(line) as OpenAiBatchOutputLine);
-}
-
-async function readOpenAiBatchError(params: {
-  openAi: OpenAiEmbeddingClient;
-  errorFileId: string;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<string | undefined> {
-  try {
-    const content = await fetchOpenAiFileContent({
-      openAi: params.openAi,
-      fileId: params.errorFileId,
-      debug: params.debug,
-    });
-    const lines = parseOpenAiBatchOutput(content);
-    const first = lines.find((line) => line.error?.message || line.response?.body?.error);
-    const message =
-      first?.error?.message ??
-      (typeof first?.response?.body?.error?.message === "string"
-        ? first?.response?.body?.error?.message
-        : undefined);
-    return message;
-  } catch (err) {
-    const message = err instanceof Error ? err.message : String(err);
-    return message ? `error file unavailable: ${message}` : undefined;
-  }
-}
-
-async function waitForOpenAiBatch(params: {
-  openAi: OpenAiEmbeddingClient;
-  batchId: string;
-  wait: boolean;
-  pollIntervalMs: number;
-  timeoutMs: number;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-  initial?: OpenAiBatchStatus;
-}): Promise<{ outputFileId: string; errorFileId?: string }> {
-  const start = Date.now();
-  let current: OpenAiBatchStatus | undefined = params.initial;
-  while (true) {
-    const status =
-      current ??
-      (await fetchOpenAiBatchStatus({
-        openAi: params.openAi,
-        batchId: params.batchId,
-        debug: params.debug,
-      }));
-    const state = status.status ?? "unknown";
-    if (state === "completed") {
-      if (!status.output_file_id) {
-        throw new Error(`openai batch ${params.batchId} completed without output file`);
-      }
-      return {
-        outputFileId: status.output_file_id,
-        errorFileId: status.error_file_id ?? undefined,
-      };
-    }
-    if (["failed", "expired", "cancelled", "canceled"].includes(state)) {
-      const detail = status.error_file_id
-        ? await readOpenAiBatchError({
-            openAi: params.openAi,
-            errorFileId: status.error_file_id,
-            debug: params.debug,
-          })
-        : undefined;
-      const suffix = detail ? `: ${detail}` : "";
-      throw new Error(`openai batch ${params.batchId} ${state}${suffix}`);
-    }
-    if (!params.wait) {
-      throw new Error(`openai batch ${params.batchId} still ${state}; wait disabled`);
-    }
-    if (Date.now() - start > params.timeoutMs) {
-      throw new Error(`openai batch ${params.batchId} timed out after ${params.timeoutMs}ms`);
-    }
-    params.debug?.(`openai batch ${params.batchId} ${state}; waiting ${params.pollIntervalMs}ms`);
-    await new Promise((resolve) => setTimeout(resolve, params.pollIntervalMs));
-    current = undefined;
-  }
-}
-
-async function runWithConcurrency<T>(tasks: Array<() => Promise<T>>, limit: number): Promise<T[]> {
-  if (tasks.length === 0) return [];
-  const resolvedLimit = Math.max(1, Math.min(limit, tasks.length));
-  const results: T[] = Array.from({ length: tasks.length });
-  let next = 0;
-  let firstError: unknown = null;
-
-  const workers = Array.from({ length: resolvedLimit }, async () => {
-    while (true) {
-      if (firstError) return;
-      const index = next;
-      next += 1;
-      if (index >= tasks.length) return;
-      try {
-        results[index] = await tasks[index]();
-      } catch (err) {
-        firstError = err;
-        return;
-      }
-    }
-  });
-
-  await Promise.allSettled(workers);
-  if (firstError) throw firstError;
-  return results;
-}
-
-export async function runOpenAiEmbeddingBatches(params: {
-  openAi: OpenAiEmbeddingClient;
-  agentId: string;
-  requests: OpenAiBatchRequest[];
-  wait: boolean;
-  pollIntervalMs: number;
-  timeoutMs: number;
-  concurrency: number;
-  debug?: (message: string, data?: Record<string, unknown>) => void;
-}): Promise<Map<string, number[]>> {
-  if (params.requests.length === 0) return new Map();
-  const groups = splitOpenAiBatchRequests(params.requests);
-  const byCustomId = new Map<string, number[]>();
-
-  const tasks = groups.map((group, groupIndex) => async () => {
-    const batchInfo = await submitOpenAiBatch({
-      openAi: params.openAi,
-      requests: group,
-      agentId: params.agentId,
-      debug: params.debug,
-    });
-    if (!batchInfo.id) {
-      throw new Error("openai batch create failed: missing batch id");
-    }
-
-    params.debug?.("memory embeddings: openai batch created", {
-      batchId: batchInfo.id,
-      status: batchInfo.status,
-      group: groupIndex + 1,
-      groups: groups.length,
-      requests: group.length,
-    });
-
-    if (!params.wait && batchInfo.status !== "completed") {
-      throw new Error(
-        `openai batch ${batchInfo.id} submitted; enable remote.batch.wait to await completion`,
-      );
-    }
-
-    const completed =
-      batchInfo.status === "completed"
-        ? {
-            outputFileId: batchInfo.output_file_id ?? "",
-            errorFileId: batchInfo.error_file_id ?? undefined,
-          }
-        : await waitForOpenAiBatch({
-            openAi: params.openAi,
-            batchId: batchInfo.id,
-            wait: params.wait,
-            pollIntervalMs: params.pollIntervalMs,
-            timeoutMs: params.timeoutMs,
-            debug: params.debug,
-            initial: batchInfo,
-          });
-    if (!completed.outputFileId) {
-      throw new Error(`openai batch ${batchInfo.id} completed without output file`);
-    }
-
-    const content = await fetchOpenAiFileContent({
-      openAi: params.openAi,
-      fileId: completed.outputFileId,
-      debug: params.debug,
-    });
-    const outputLines = parseOpenAiBatchOutput(content);
-    const errors: string[] = [];
-    const remaining = new Set(group.map((request) => request.custom_id));
-
-    for (const line of outputLines) {
-      const customId = line.custom_id;
-      if (!customId) continue;
-      remaining.delete(customId);
-      if (line.error?.message) {
-        errors.push(`${customId}: ${line.error.message}`);
-        continue;
-      }
-      const response = line.response;
-      const statusCode = response?.status_code ?? 0;
-      if (statusCode >= 400) {
-        const message =
-          response?.body?.error?.message ??
-          (typeof response?.body === "string" ? response.body : undefined) ??
-          "unknown error";
-        errors.push(`${customId}: ${message}`);
-        continue;
-      }
-      const data = response?.body?.data ?? [];
-      const embedding = data[0]?.embedding ?? [];
-      if (embedding.length === 0) {
-        errors.push(`${customId}: empty embedding`);
-        continue;
-      }
-      byCustomId.set(customId, embedding);
-    }
-
-    if (errors.length > 0) {
-      throw new Error(`openai batch ${batchInfo.id} failed: ${errors.join("; ")}`);
-    }
-    if (remaining.size > 0) {
-      throw new Error(`openai batch ${batchInfo.id} missing ${remaining.size} embedding responses`);
-    }
-  });
-
-  params.debug?.("memory embeddings: openai batch submit", {
-    requests: params.requests.length,
-    groups: groups.length,
-    wait: params.wait,
-    concurrency: params.concurrency,
-    pollIntervalMs: params.pollIntervalMs,
-    timeoutMs: params.timeoutMs,
-  });
-
-  await runWithConcurrency(tasks, params.concurrency);
-  return byCustomId;
-}
+// Deprecated: use ./batch-openai.js
+export * from "./batch-openai.js";