fix: reset session after compaction overflow

This commit is contained in:
Peter Steinberger
2026-01-12 00:28:02 +00:00
parent 32df2ef7bd
commit 67743325ee
6 changed files with 485 additions and 320 deletions

View File

@@ -29,6 +29,7 @@
- Agents: treat message tool errors as failures so fallback replies still send; require `to` + `message` for `action=send`. (#717) — thanks @theglove44. - Agents: treat message tool errors as failures so fallback replies still send; require `to` + `message` for `action=send`. (#717) — thanks @theglove44.
- Agents: route subagent transcripts to the target agent sessions directory and add regression coverage. (#708) — thanks @xMikeMickelson. - Agents: route subagent transcripts to the target agent sessions directory and add regression coverage. (#708) — thanks @xMikeMickelson.
- Agents/Tools: preserve action enums when flattening tool schemas. (#708) — thanks @xMikeMickelson. - Agents/Tools: preserve action enums when flattening tool schemas. (#708) — thanks @xMikeMickelson.
- Agents: reset sessions and retry when auto-compaction overflows instead of crashing the gateway.
## 2026.1.10 ## 2026.1.10

View File

@@ -7,6 +7,7 @@ import {
formatAssistantErrorText, formatAssistantErrorText,
isBillingErrorMessage, isBillingErrorMessage,
isCloudCodeAssistFormatError, isCloudCodeAssistFormatError,
isCompactionFailureError,
isContextOverflowError, isContextOverflowError,
isFailoverErrorMessage, isFailoverErrorMessage,
isMessagingToolDuplicate, isMessagingToolDuplicate,
@@ -208,6 +209,8 @@ describe("isContextOverflowError", () => {
"Request exceeds the maximum size", "Request exceeds the maximum size",
"context length exceeded", "context length exceeded",
"Maximum context length", "Maximum context length",
"prompt is too long: 208423 tokens > 200000 maximum",
"Context overflow: Summarization failed",
"413 Request Entity Too Large", "413 Request Entity Too Large",
]; ];
for (const sample of samples) { for (const sample of samples) {
@@ -220,6 +223,26 @@ describe("isContextOverflowError", () => {
}); });
}); });
// Covers the classifier that separates "compaction itself overflowed" from
// ordinary context-overflow and unrelated provider errors.
describe("isCompactionFailureError", () => {
  it("matches compaction overflow failures", () => {
    // Each message is both a context-overflow error and mentions
    // summarization/compaction.
    const compactionMessages = [
      "Context overflow: Summarization failed: 400 {\"message\":\"prompt is too long\"}",
      "auto-compaction failed due to context overflow",
      "Compaction failed: prompt is too long",
    ];
    compactionMessages.forEach((text) => {
      expect(isCompactionFailureError(text)).toBe(true);
    });
  });

  it("ignores non-compaction overflow errors", () => {
    // A plain overflow (no compaction wording) and a non-overflow error.
    const unrelatedMessages = [
      "Context overflow: prompt too large",
      "rate limit exceeded",
    ];
    unrelatedMessages.forEach((text) => {
      expect(isCompactionFailureError(text)).toBe(false);
    });
  });
});
describe("isBillingErrorMessage", () => { describe("isBillingErrorMessage", () => {
it("matches credit / payment failures", () => { it("matches credit / payment failures", () => {
const samples = [ const samples = [

View File

@@ -244,10 +244,24 @@ export function isContextOverflowError(errorMessage?: string): boolean {
lower.includes("request exceeds the maximum size") || lower.includes("request exceeds the maximum size") ||
lower.includes("context length exceeded") || lower.includes("context length exceeded") ||
lower.includes("maximum context length") || lower.includes("maximum context length") ||
lower.includes("prompt is too long") ||
lower.includes("context overflow") ||
(lower.includes("413") && lower.includes("too large")) (lower.includes("413") && lower.includes("too large"))
); );
} }
/**
 * Reports whether an error message describes a context overflow that
 * happened while compacting/summarizing the session.
 *
 * The message must first be accepted by `isContextOverflowError`; it must
 * then also contain one of the compaction markers below (case-insensitive).
 *
 * @param errorMessage - Raw error text; `undefined` or empty yields `false`.
 * @returns `true` only for overflow errors that mention compaction or
 *   summarization.
 */
export function isCompactionFailureError(errorMessage?: string): boolean {
  if (!errorMessage || !isContextOverflowError(errorMessage)) return false;

  const normalized = errorMessage.toLowerCase();
  // Note: the bare "compaction" marker already covers the two longer
  // compaction phrases; they are kept so the recognised wordings stay listed.
  const compactionMarkers = [
    "summarization failed",
    "auto-compaction",
    "compaction failed",
    "compaction",
  ];
  return compactionMarkers.some((marker) => normalized.includes(marker));
}
export function formatAssistantErrorText( export function formatAssistantErrorText(
msg: AssistantMessage, msg: AssistantMessage,
opts?: { cfg?: ClawdbotConfig; sessionKey?: string }, opts?: { cfg?: ClawdbotConfig; sessionKey?: string },

View File

@@ -36,6 +36,7 @@ import { isCacheEnabled, resolveCacheTtlMs } from "../config/cache-utils.js";
import type { ClawdbotConfig } from "../config/config.js"; import type { ClawdbotConfig } from "../config/config.js";
import { resolveProviderCapabilities } from "../config/provider-capabilities.js"; import { resolveProviderCapabilities } from "../config/provider-capabilities.js";
import { getMachineDisplayName } from "../infra/machine-name.js"; import { getMachineDisplayName } from "../infra/machine-name.js";
import { registerUnhandledRejectionHandler } from "../infra/unhandled-rejections.js";
import { createSubsystemLogger } from "../logging.js"; import { createSubsystemLogger } from "../logging.js";
import { import {
type enqueueCommand, type enqueueCommand,
@@ -85,6 +86,7 @@ import {
formatAssistantErrorText, formatAssistantErrorText,
isAuthAssistantError, isAuthAssistantError,
isCloudCodeAssistFormatError, isCloudCodeAssistFormatError,
isCompactionFailureError,
isContextOverflowError, isContextOverflowError,
isFailoverAssistantError, isFailoverAssistantError,
isFailoverErrorMessage, isFailoverErrorMessage,
@@ -408,6 +410,13 @@ type EmbeddedPiQueueHandle = {
const log = createSubsystemLogger("agent/embedded"); const log = createSubsystemLogger("agent/embedded");
const GOOGLE_TURN_ORDERING_CUSTOM_TYPE = "google-turn-ordering-bootstrap"; const GOOGLE_TURN_ORDERING_CUSTOM_TYPE = "google-turn-ordering-bootstrap";
// Module-level hook: log unhandled rejections that are compaction failures
// and report them as handled; everything else is left to other handlers.
// NOTE(review): presumably returning `true` tells the registry the rejection
// was handled so it is not treated as fatal — confirm against
// infra/unhandled-rejections.
registerUnhandledRejectionHandler((reason) => {
  const message = describeUnknownError(reason);
  if (isCompactionFailureError(message)) {
    log.error(`Auto-compaction failed (unhandled): ${message}`);
    return true;
  }
  return false;
});
type CustomEntryLike = { type?: unknown; customType?: unknown }; type CustomEntryLike = { type?: unknown; customType?: unknown };
function hasGoogleTurnOrderingMarker(sessionManager: SessionManager): boolean { function hasGoogleTurnOrderingMarker(sessionManager: SessionManager): boolean {

View File

@@ -375,6 +375,57 @@ describe("runReplyAgent typing (heartbeat)", () => {
} }
}); });
// Regression test: when the first agent run throws a compaction-overflow
// error, runReplyAgent must mint a new session id, persist it, and retry once.
it("retries after compaction failure by resetting the session", async () => {
  const prevStateDir = process.env.CLAWDBOT_STATE_DIR;
  const stateDir = await fs.mkdtemp(
    path.join(tmpdir(), "clawdbot-session-compaction-reset-"),
  );
  process.env.CLAWDBOT_STATE_DIR = stateDir;
  try {
    const sessionId = "session";
    const storePath = path.join(stateDir, "sessions", "sessions.json");
    const sessionEntry = { sessionId, updatedAt: Date.now() };
    const sessionStore = { main: sessionEntry };

    // Seed the on-disk store so the reset has something to overwrite.
    await fs.mkdir(path.dirname(storePath), { recursive: true });
    await fs.writeFile(storePath, JSON.stringify(sessionStore), "utf-8");

    // First attempt fails with a compaction overflow; the retry succeeds.
    runEmbeddedPiAgentMock
      .mockImplementationOnce(async () => {
        throw new Error(
          "Context overflow: Summarization failed: 400 {\"message\":\"prompt is too long\"}",
        );
      })
      .mockImplementationOnce(async () => ({
        payloads: [{ text: "ok" }],
        meta: {},
      }));

    // The mock is shared across tests, so count calls relative to this point.
    const callsBefore = runEmbeddedPiAgentMock.mock.calls.length;
    const { run } = createMinimalRun({
      sessionEntry,
      sessionStore,
      sessionKey: "main",
      storePath,
    });
    const res = await run();

    expect(runEmbeddedPiAgentMock.mock.calls.length - callsBefore).toBe(2);
    const payload = Array.isArray(res) ? res[0] : res;
    expect(payload).toMatchObject({ text: "ok" });

    // The in-memory store and the persisted store must agree on the new id.
    expect(sessionStore.main.sessionId).not.toBe(sessionId);
    const persisted = JSON.parse(await fs.readFile(storePath, "utf-8"));
    expect(persisted.main.sessionId).toBe(sessionStore.main.sessionId);
  } finally {
    // Compare against undefined (not truthiness) so a previously empty-string
    // value is restored rather than deleted.
    if (prevStateDir === undefined) {
      delete process.env.CLAWDBOT_STATE_DIR;
    } else {
      process.env.CLAWDBOT_STATE_DIR = prevStateDir;
    }
    // Remove the temp state dir so repeated runs do not accumulate
    // directories under the OS temp folder.
    await fs.rm(stateDir, { recursive: true, force: true });
  }
});
it("still replies even if session reset fails to persist", async () => { it("still replies even if session reset fails to persist", async () => {
const prevStateDir = process.env.CLAWDBOT_STATE_DIR; const prevStateDir = process.env.CLAWDBOT_STATE_DIR;
const stateDir = await fs.mkdtemp( const stateDir = await fs.mkdtemp(

View File

@@ -7,6 +7,10 @@ import { DEFAULT_CONTEXT_TOKENS } from "../../agents/defaults.js";
import { resolveModelAuthMode } from "../../agents/model-auth.js"; import { resolveModelAuthMode } from "../../agents/model-auth.js";
import { runWithModelFallback } from "../../agents/model-fallback.js"; import { runWithModelFallback } from "../../agents/model-fallback.js";
import { isCliProvider } from "../../agents/model-selection.js"; import { isCliProvider } from "../../agents/model-selection.js";
import {
isCompactionFailureError,
isContextOverflowError,
} from "../../agents/pi-embedded-helpers.js";
import { import {
queueEmbeddedPiMessage, queueEmbeddedPiMessage,
runEmbeddedPiAgent, runEmbeddedPiAgent,
@@ -15,6 +19,7 @@ import { hasNonzeroUsage, type NormalizedUsage } from "../../agents/usage.js";
import type { ClawdbotConfig } from "../../config/config.js"; import type { ClawdbotConfig } from "../../config/config.js";
import { import {
loadSessionStore, loadSessionStore,
resolveAgentIdFromSessionKey,
resolveSessionTranscriptPath, resolveSessionTranscriptPath,
type SessionEntry, type SessionEntry,
saveSessionStore, saveSessionStore,
@@ -231,6 +236,10 @@ export async function runReplyAgent(params: {
typingMode, typingMode,
} = params; } = params;
let activeSessionEntry = sessionEntry;
let activeSessionStore = sessionStore;
let activeIsNewSession = isNewSession;
const isHeartbeat = opts?.isHeartbeat === true; const isHeartbeat = opts?.isHeartbeat === true;
const typingSignals = createTypingSignaler({ const typingSignals = createTypingSignaler({
typing, typing,
@@ -303,11 +312,11 @@ export async function runReplyAgent(params: {
followupRun.prompt, followupRun.prompt,
); );
if (steered && !shouldFollowup) { if (steered && !shouldFollowup) {
if (sessionEntry && sessionStore && sessionKey) { if (activeSessionEntry && activeSessionStore && sessionKey) {
sessionEntry.updatedAt = Date.now(); activeSessionEntry.updatedAt = Date.now();
sessionStore[sessionKey] = sessionEntry; activeSessionStore[sessionKey] = activeSessionEntry;
if (storePath) { if (storePath) {
await saveSessionStore(storePath, sessionStore); await saveSessionStore(storePath, activeSessionStore);
} }
} }
typing.cleanup(); typing.cleanup();
@@ -317,11 +326,11 @@ export async function runReplyAgent(params: {
if (isActive && (shouldFollowup || resolvedQueue.mode === "steer")) { if (isActive && (shouldFollowup || resolvedQueue.mode === "steer")) {
enqueueFollowupRun(queueKey, followupRun, resolvedQueue); enqueueFollowupRun(queueKey, followupRun, resolvedQueue);
if (sessionEntry && sessionStore && sessionKey) { if (activeSessionEntry && activeSessionStore && sessionKey) {
sessionEntry.updatedAt = Date.now(); activeSessionEntry.updatedAt = Date.now();
sessionStore[sessionKey] = sessionEntry; activeSessionStore[sessionKey] = activeSessionEntry;
if (storePath) { if (storePath) {
await saveSessionStore(storePath, sessionStore); await saveSessionStore(storePath, activeSessionStore);
} }
} }
typing.cleanup(); typing.cleanup();
@@ -332,8 +341,8 @@ export async function runReplyAgent(params: {
opts, opts,
typing, typing,
typingMode, typingMode,
sessionEntry, sessionEntry: activeSessionEntry,
sessionStore, sessionStore: activeSessionStore,
sessionKey, sessionKey,
storePath, storePath,
defaultModel, defaultModel,
@@ -348,6 +357,46 @@ export async function runReplyAgent(params: {
let didLogHeartbeatStrip = false; let didLogHeartbeatStrip = false;
let autoCompactionCompleted = false; let autoCompactionCompleted = false;
let responseUsageLine: string | undefined; let responseUsageLine: string | undefined;
/**
 * Replaces the session stored under `sessionKey` with a fresh one so the
 * current run can be retried after auto-compaction failed.
 *
 * The new entry copies the existing one (store entry first, else the active
 * entry) but gets a new session id and transcript path, a current
 * `updatedAt`, and `systemSent` / `abortedLastRun` cleared. The follow-up run
 * and the `active*` locals are pointed at the new session.
 *
 * @param reason - Error text included in the log line.
 * @returns `false` when there is no session key, store, or store path to
 *   reset; `true` otherwise, even if persisting the store failed.
 */
const resetSessionAfterCompactionFailure = async (
  reason: string,
): Promise<boolean> => {
  // Nothing to reset without a keyed, persistable store.
  if (!(sessionKey && activeSessionStore && storePath)) return false;

  const freshSessionId = crypto.randomUUID();
  const previousEntry = activeSessionStore[sessionKey] ?? activeSessionEntry;
  const freshEntry: SessionEntry = {
    ...previousEntry,
    sessionId: freshSessionId,
    updatedAt: Date.now(),
    systemSent: false,
    abortedLastRun: false,
  };

  // Only a numeric thread id is passed through as the topic id.
  const threadId = sessionCtx.MessageThreadId;
  const freshSessionFile = resolveSessionTranscriptPath(
    freshSessionId,
    resolveAgentIdFromSessionKey(sessionKey),
    typeof threadId === "number" ? threadId : undefined,
  );
  freshEntry.sessionFile = freshSessionFile;
  activeSessionStore[sessionKey] = freshEntry;

  try {
    await saveSessionStore(storePath, activeSessionStore);
  } catch (persistErr) {
    // Best effort: a failed write is logged, and the retry still proceeds
    // with the in-memory session.
    defaultRuntime.error(
      `Failed to persist session reset after compaction failure (${sessionKey}): ${String(persistErr)}`,
    );
  }

  // Point the pending run and the local "active" state at the new session.
  followupRun.run.sessionId = freshSessionId;
  followupRun.run.sessionFile = freshSessionFile;
  activeSessionEntry = freshEntry;
  activeIsNewSession = true;

  defaultRuntime.error(
    `Auto-compaction failed (${reason}). Restarting session ${sessionKey} -> ${freshSessionId} and retrying.`,
  );
  return true;
};
try { try {
const runId = crypto.randomUUID(); const runId = crypto.randomUUID();
if (sessionKey) { if (sessionKey) {
@@ -359,6 +408,8 @@ export async function runReplyAgent(params: {
let runResult: Awaited<ReturnType<typeof runEmbeddedPiAgent>>; let runResult: Awaited<ReturnType<typeof runEmbeddedPiAgent>>;
let fallbackProvider = followupRun.run.provider; let fallbackProvider = followupRun.run.provider;
let fallbackModel = followupRun.run.model; let fallbackModel = followupRun.run.model;
let didResetAfterCompactionFailure = false;
while (true) {
try { try {
const allowPartialStream = !( const allowPartialStream = !(
followupRun.run.reasoningLevel === "stream" && opts?.onReasoningStream followupRun.run.reasoningLevel === "stream" && opts?.onReasoningStream
@@ -378,7 +429,7 @@ export async function runReplyAgent(params: {
startedAt, startedAt,
}, },
}); });
const cliSessionId = getCliSessionId(sessionEntry, provider); const cliSessionId = getCliSessionId(activeSessionEntry, provider);
return runCliAgent({ return runCliAgent({
sessionId: followupRun.run.sessionId, sessionId: followupRun.run.sessionId,
sessionKey, sessionKey,
@@ -572,7 +623,8 @@ export async function runReplyAgent(params: {
parsed.audioAsVoice || payload.audioAsVoice, parsed.audioAsVoice || payload.audioAsVoice,
), ),
replyToId: taggedPayload.replyToId ?? parsed.replyToId, replyToId: taggedPayload.replyToId ?? parsed.replyToId,
replyToTag: taggedPayload.replyToTag || parsed.replyToTag, replyToTag:
taggedPayload.replyToTag || parsed.replyToTag,
replyToCurrent: replyToCurrent:
taggedPayload.replyToCurrent || parsed.replyToCurrent, taggedPayload.replyToCurrent || parsed.replyToCurrent,
}); });
@@ -621,7 +673,9 @@ export async function runReplyAgent(params: {
}); });
})() })()
.catch((err) => { .catch((err) => {
logVerbose(`tool result delivery failed: ${String(err)}`); logVerbose(
`tool result delivery failed: ${String(err)}`,
);
}) })
.finally(() => { .finally(() => {
pendingToolTasks.delete(task); pendingToolTasks.delete(task);
@@ -635,16 +689,28 @@ export async function runReplyAgent(params: {
runResult = fallbackResult.result; runResult = fallbackResult.result;
fallbackProvider = fallbackResult.provider; fallbackProvider = fallbackResult.provider;
fallbackModel = fallbackResult.model; fallbackModel = fallbackResult.model;
break;
} catch (err) { } catch (err) {
const message = err instanceof Error ? err.message : String(err); const message = err instanceof Error ? err.message : String(err);
const isContextOverflow = const isContextOverflow =
isContextOverflowError(message) ||
/context.*overflow|too large|context window/i.test(message); /context.*overflow|too large|context window/i.test(message);
const isCompactionFailure = isCompactionFailureError(message);
const isSessionCorruption = const isSessionCorruption =
/function call turn comes immediately after/i.test(message); /function call turn comes immediately after/i.test(message);
if (
isCompactionFailure &&
!didResetAfterCompactionFailure &&
(await resetSessionAfterCompactionFailure(message))
) {
didResetAfterCompactionFailure = true;
continue;
}
// Auto-recover from Gemini session corruption by resetting the session // Auto-recover from Gemini session corruption by resetting the session
if (isSessionCorruption && sessionKey && sessionStore && storePath) { if (isSessionCorruption && sessionKey && activeSessionStore && storePath) {
const corruptedSessionId = sessionEntry?.sessionId; const corruptedSessionId = activeSessionEntry?.sessionId;
defaultRuntime.error( defaultRuntime.error(
`Session history corrupted (Gemini function call ordering). Resetting session: ${sessionKey}`, `Session history corrupted (Gemini function call ordering). Resetting session: ${sessionKey}`,
); );
@@ -662,8 +728,8 @@ export async function runReplyAgent(params: {
} }
// Remove session entry from store // Remove session entry from store
delete sessionStore[sessionKey]; delete activeSessionStore[sessionKey];
await saveSessionStore(storePath, sessionStore); await saveSessionStore(storePath, activeSessionStore);
} catch (cleanupErr) { } catch (cleanupErr) {
defaultRuntime.error( defaultRuntime.error(
`Failed to reset corrupted session ${sessionKey}: ${String(cleanupErr)}`, `Failed to reset corrupted session ${sessionKey}: ${String(cleanupErr)}`,
@@ -682,19 +748,20 @@ export async function runReplyAgent(params: {
: `⚠️ Agent failed before reply: ${message}. Check gateway logs for details.`, : `⚠️ Agent failed before reply: ${message}. Check gateway logs for details.`,
}); });
} }
}
if ( if (
shouldInjectGroupIntro && shouldInjectGroupIntro &&
sessionEntry && activeSessionEntry &&
sessionStore && activeSessionStore &&
sessionKey && sessionKey &&
sessionEntry.groupActivationNeedsSystemIntro activeSessionEntry.groupActivationNeedsSystemIntro
) { ) {
sessionEntry.groupActivationNeedsSystemIntro = false; activeSessionEntry.groupActivationNeedsSystemIntro = false;
sessionEntry.updatedAt = Date.now(); activeSessionEntry.updatedAt = Date.now();
sessionStore[sessionKey] = sessionEntry; activeSessionStore[sessionKey] = activeSessionEntry;
if (storePath) { if (storePath) {
await saveSessionStore(storePath, sessionStore); await saveSessionStore(storePath, activeSessionStore);
} }
} }
@@ -814,7 +881,7 @@ export async function runReplyAgent(params: {
const contextTokensUsed = const contextTokensUsed =
agentCfgContextTokens ?? agentCfgContextTokens ??
lookupContextTokens(modelUsed) ?? lookupContextTokens(modelUsed) ??
sessionEntry?.contextTokens ?? activeSessionEntry?.contextTokens ??
DEFAULT_CONTEXT_TOKENS; DEFAULT_CONTEXT_TOKENS;
if (storePath && sessionKey) { if (storePath && sessionKey) {
@@ -884,9 +951,9 @@ export async function runReplyAgent(params: {
} }
const responseUsageEnabled = const responseUsageEnabled =
(sessionEntry?.responseUsage ?? (activeSessionEntry?.responseUsage ??
(sessionKey (sessionKey
? sessionStore?.[sessionKey]?.responseUsage ? activeSessionStore?.[sessionKey]?.responseUsage
: undefined)) === "on"; : undefined)) === "on";
if (responseUsageEnabled && hasNonzeroUsage(usage)) { if (responseUsageEnabled && hasNonzeroUsage(usage)) {
const authMode = resolveModelAuthMode(providerUsed, cfg); const authMode = resolveModelAuthMode(providerUsed, cfg);
@@ -910,8 +977,8 @@ export async function runReplyAgent(params: {
let finalPayloads = replyPayloads; let finalPayloads = replyPayloads;
if (autoCompactionCompleted) { if (autoCompactionCompleted) {
const count = await incrementCompactionCount({ const count = await incrementCompactionCount({
sessionEntry, sessionEntry: activeSessionEntry,
sessionStore, sessionStore: activeSessionStore,
sessionKey, sessionKey,
storePath, storePath,
}); });
@@ -923,7 +990,7 @@ export async function runReplyAgent(params: {
]; ];
} }
} }
if (resolvedVerboseLevel === "on" && isNewSession) { if (resolvedVerboseLevel === "on" && activeIsNewSession) {
finalPayloads = [ finalPayloads = [
{ text: `🧭 New session: ${followupRun.run.sessionId}` }, { text: `🧭 New session: ${followupRun.run.sessionId}` },
...finalPayloads, ...finalPayloads,