refactor(agents): centralize failover handling

This commit is contained in:
Peter Steinberger
2026-01-09 21:31:13 +01:00
parent cfeaa34c16
commit 374aa856f2
7 changed files with 292 additions and 61 deletions

View File

@@ -151,6 +151,49 @@ describe("resolveAuthProfileOrder", () => {
expect(order).toEqual(["anthropic:work", "anthropic:default"]);
});
// A profile that is currently disabled must be ordered last, even when the
// store's explicit per-provider order lists it first.
it("pushes disabled profiles to the end even with store order", () => {
  const disabledUntil = Date.now() + 60_000;
  const expectedOrder = ["anthropic:work", "anthropic:default"];

  const resolved = resolveAuthProfileOrder({
    store: {
      ...store,
      order: { anthropic: ["anthropic:default", "anthropic:work"] },
      usageStats: {
        "anthropic:default": { disabledUntil, disabledReason: "billing" },
        "anthropic:work": { lastUsed: 1 },
      },
    },
    provider: "anthropic",
  });

  expect(resolved).toEqual(expectedOrder);
});
// Same expectation as the store-order case, but the explicit order comes from
// the config (`cfg.auth.order`) instead of the store.
it("pushes disabled profiles to the end even with configured order", () => {
  const disabledUntil = Date.now() + 60_000;
  const expectedOrder = ["anthropic:work", "anthropic:default"];

  const resolved = resolveAuthProfileOrder({
    cfg: {
      auth: {
        order: { anthropic: ["anthropic:default", "anthropic:work"] },
        profiles: cfg.auth.profiles,
      },
    },
    store: {
      ...store,
      usageStats: {
        "anthropic:default": { disabledUntil, disabledReason: "billing" },
        "anthropic:work": { lastUsed: 1 },
      },
    },
    provider: "anthropic",
  });

  expect(resolved).toEqual(expectedOrder);
});
it("normalizes z.ai aliases in auth.order", () => {
const order = resolveAuthProfileOrder({
cfg: {

View File

@@ -72,11 +72,21 @@ export type AuthProfileCredential =
| TokenCredential
| OAuthCredential;
/**
 * Why a request made with an auth profile failed.
 *
 * `markAuthProfileFailure` counts failures per reason. A "billing" failure
 * sets `disabledUntil` / `disabledReason` on the profile; every other reason
 * sets `cooldownUntil`. `markAuthProfileCooldown` records its failures as
 * "unknown".
 */
export type AuthProfileFailureReason =
| "auth"
| "rate_limit"
| "billing"
| "timeout"
| "unknown";
/**
 * Per-profile usage statistics for round-robin and cooldown tracking.
 *
 * All fields are optional; `markAuthProfileUsed` resets `errorCount` to 0 and
 * clears `cooldownUntil`, `disabledUntil`, `disabledReason` and
 * `failureCounts`.
 */
export type ProfileUsageStats = {
// `Date.now()` timestamp written when the profile is marked as used.
lastUsed?: number;
// `Date.now()`-based timestamp; set on non-billing failures (now + backoff).
cooldownUntil?: number;
// `Date.now()`-based timestamp; set on billing failures (now + billing backoff).
disabledUntil?: number;
// Reason recorded together with `disabledUntil` ("billing" in the visible code).
disabledReason?: AuthProfileFailureReason;
// Total failures since the last successful use; incremented on every failure.
errorCount?: number;
// Failure count broken down by reason; incremented alongside `errorCount`.
failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
};
export type AuthProfileStore = {
@@ -772,8 +782,9 @@ export function isProfileInCooldown(
profileId: string,
): boolean {
const stats = store.usageStats?.[profileId];
if (!stats?.cooldownUntil) return false;
return Date.now() < stats.cooldownUntil;
if (!stats) return false;
const unusableUntil = resolveProfileUnusableUntil(stats);
return unusableUntil ? Date.now() < unusableUntil : false;
}
/**
@@ -796,6 +807,9 @@ export async function markAuthProfileUsed(params: {
lastUsed: Date.now(),
errorCount: 0,
cooldownUntil: undefined,
disabledUntil: undefined,
disabledReason: undefined,
failureCounts: undefined,
};
return true;
},
@@ -812,6 +826,9 @@ export async function markAuthProfileUsed(params: {
lastUsed: Date.now(),
errorCount: 0,
cooldownUntil: undefined,
disabledUntil: undefined,
disabledReason: undefined,
failureCounts: undefined,
};
saveAuthProfileStore(store, agentDir);
}
@@ -824,34 +841,74 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
);
}
/**
 * Returns how long (in milliseconds) a profile stays disabled after its
 * `errorCount`-th billing failure.
 *
 * Schedule: 1st failure 30 min, 2nd 2 hours, 3rd 8 hours, 4th and later
 * 24 hours. Counts below 1 are treated as 1.
 *
 * The count is floored and NaN is treated as 1 so the schedule lookup always
 * uses a valid integer index. Without this, a fractional or NaN count (e.g.
 * from a hand-edited or corrupted store) would yield `undefined`, and the
 * caller would compute `disabledUntil = now + undefined = NaN`, silently
 * dropping the disable window.
 *
 * @param errorCount Number of billing failures recorded for the profile.
 * @returns Disable duration in milliseconds; always one of the schedule steps.
 */
function calculateAuthProfileBillingDisableMs(errorCount: number): number {
  const minuteMs = 60 * 1000;
  const hourMs = 60 * minuteMs;
  const longestStepMs = 24 * hourMs;
  const steps = [30 * minuteMs, 2 * hourMs, 8 * hourMs, longestStepMs];

  // Math.floor keeps +Infinity (clamped to the last step below); NaN falls back to 1.
  const normalized = Number.isNaN(errorCount)
    ? 1
    : Math.max(1, Math.floor(errorCount));
  const index = Math.min(normalized - 1, steps.length - 1);
  return steps[index] ?? longestStepMs;
}
/**
 * Returns the later of `cooldownUntil` and `disabledUntil` for a profile.
 *
 * Only finite numbers greater than zero are considered; if neither field
 * qualifies the result is `null`.
 */
function resolveProfileUnusableUntil(stats: ProfileUsageStats): number | null {
  let latest: number | null = null;
  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
    if (typeof candidate !== "number") continue;
    if (!Number.isFinite(candidate) || candidate <= 0) continue;
    if (latest === null || candidate > latest) latest = candidate;
  }
  return latest;
}
/**
 * Looks up the usage stats for `profileId` in `store` and returns the
 * timestamp until which the profile is unusable (see
 * `resolveProfileUnusableUntil`), or `null` when the profile has no stats
 * entry or no active cooldown/disable timestamp.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  const profileStats = store.usageStats?.[profileId];
  return profileStats ? resolveProfileUnusableUntil(profileStats) : null;
}
/**
* Mark a profile as failed/rate-limited. Applies exponential backoff cooldown.
* Cooldown times: 1min, 5min, 25min, max 1 hour.
* Uses store lock to avoid overwriting concurrent usage updates.
* Mark a profile as failed for a specific reason. Billing failures are treated
* as "disabled" (longer backoff) vs the regular cooldown window.
*/
export async function markAuthProfileCooldown(params: {
export async function markAuthProfileFailure(params: {
store: AuthProfileStore;
profileId: string;
reason: AuthProfileFailureReason;
agentDir?: string;
}): Promise<void> {
const { store, profileId, agentDir } = params;
const { store, profileId, reason, agentDir } = params;
const updated = await updateAuthProfileStoreWithLock({
agentDir,
updater: (freshStore) => {
if (!freshStore.profiles[profileId]) return false;
freshStore.usageStats = freshStore.usageStats ?? {};
const existing = freshStore.usageStats[profileId] ?? {};
const errorCount = (existing.errorCount ?? 0) + 1;
// Exponential backoff: 1min, 5min, 25min, capped at 1h
const backoffMs = calculateAuthProfileCooldownMs(errorCount);
const nextErrorCount = (existing.errorCount ?? 0) + 1;
const failureCounts = { ...existing.failureCounts };
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
freshStore.usageStats[profileId] = {
const now = Date.now();
const updatedStats: ProfileUsageStats = {
...existing,
errorCount,
cooldownUntil: Date.now() + backoffMs,
errorCount: nextErrorCount,
failureCounts,
};
if (reason === "billing") {
const billingCount = failureCounts.billing ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
updatedStats.disabledUntil = now + backoffMs;
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
updatedStats.cooldownUntil = now + backoffMs;
}
freshStore.usageStats[profileId] = updatedStats;
return true;
},
});
@@ -863,19 +920,48 @@ export async function markAuthProfileCooldown(params: {
store.usageStats = store.usageStats ?? {};
const existing = store.usageStats[profileId] ?? {};
const errorCount = (existing.errorCount ?? 0) + 1;
const nextErrorCount = (existing.errorCount ?? 0) + 1;
const failureCounts = { ...existing.failureCounts };
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
// Exponential backoff: 1min, 5min, 25min, capped at 1h
const backoffMs = calculateAuthProfileCooldownMs(errorCount);
store.usageStats[profileId] = {
const now = Date.now();
const updatedStats: ProfileUsageStats = {
...existing,
errorCount,
cooldownUntil: Date.now() + backoffMs,
errorCount: nextErrorCount,
failureCounts,
};
if (reason === "billing") {
const billingCount = failureCounts.billing ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
updatedStats.disabledUntil = now + backoffMs;
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
updatedStats.cooldownUntil = now + backoffMs;
}
store.usageStats[profileId] = updatedStats;
saveAuthProfileStore(store, agentDir);
}
/**
 * Mark a profile as failed/rate-limited without a specific reason.
 *
 * Thin wrapper that forwards to `markAuthProfileFailure` with reason
 * "unknown", so the profile receives the regular exponential-backoff cooldown
 * (1min, 5min, 25min, max 1 hour) rather than the billing disable window.
 * The store lock is taken by `markAuthProfileFailure` to avoid overwriting
 * concurrent usage updates.
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;
  await markAuthProfileFailure({ store, profileId, reason: "unknown", agentDir });
}
/**
* Clear cooldown for a profile (e.g., manual reset).
* Uses store lock to avoid overwriting concurrent usage updates.
@@ -973,7 +1059,8 @@ export function resolveAuthProfileOrder(params: {
const inCooldown: Array<{ profileId: string; cooldownUntil: number }> = [];
for (const profileId of deduped) {
const cooldownUntil = store.usageStats?.[profileId]?.cooldownUntil;
const cooldownUntil =
resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? 0;
if (
typeof cooldownUntil === "number" &&
Number.isFinite(cooldownUntil) &&
@@ -1057,7 +1144,8 @@ function orderProfilesByMode(
const cooldownSorted = inCooldown
.map((profileId) => ({
profileId,
cooldownUntil: store.usageStats?.[profileId]?.cooldownUntil ?? now,
cooldownUntil:
resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? now,
}))
.sort((a, b) => a.cooldownUntil - b.cooldownUntil)
.map((entry) => entry.profileId);

View File

@@ -56,6 +56,28 @@ describe("runWithModelFallback", () => {
expect(run.mock.calls[1]?.[1]).toBe("claude-haiku-3-5");
});
// An error carrying HTTP status 402 on the first attempt must trigger a second
// attempt against the configured fallback model.
it("falls back on 402 payment required", async () => {
  const cfg = makeCfg();
  const paymentRequired = Object.assign(new Error("payment required"), {
    status: 402,
  });
  const run = vi
    .fn()
    .mockRejectedValueOnce(paymentRequired)
    .mockResolvedValueOnce("ok");

  const outcome = await runWithModelFallback({
    cfg,
    provider: "openai",
    model: "gpt-4.1-mini",
    run,
  });

  expect(outcome.result).toBe("ok");
  expect(run).toHaveBeenCalledTimes(2);
  const secondCall = run.mock.calls[1];
  expect(secondCall?.[0]).toBe("anthropic");
  expect(secondCall?.[1]).toBe("claude-haiku-3-5");
});
it("falls back on billing errors", async () => {
const cfg = makeCfg();
const run = vi

View File

@@ -7,11 +7,7 @@ import {
resolveConfiguredModelRef,
resolveModelRefFromString,
} from "./model-selection.js";
import {
isAuthErrorMessage,
isBillingErrorMessage,
isRateLimitErrorMessage,
} from "./pi-embedded-helpers.js";
import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";
type ModelCandidate = {
provider: string;
@@ -71,16 +67,6 @@ function getErrorMessage(err: unknown): string {
return "";
}
/**
 * Returns true when the error text contains one of the known timeout or
 * deadline phrases. Matching is case-insensitive substring search.
 */
function isTimeoutErrorMessage(raw: string): boolean {
  const timeoutMarkers = [
    "timeout",
    "timed out",
    "deadline exceeded",
    "context deadline exceeded",
  ];
  const haystack = raw.toLowerCase();
  return timeoutMarkers.some((marker) => haystack.includes(marker));
}
function shouldFallbackForError(err: unknown): boolean {
const statusCode = getStatusCode(err);
if (statusCode && [401, 402, 403, 429].includes(statusCode)) return true;
@@ -94,12 +80,7 @@ function shouldFallbackForError(err: unknown): boolean {
}
const message = getErrorMessage(err);
if (!message) return false;
return (
isAuthErrorMessage(message) ||
isRateLimitErrorMessage(message) ||
isBillingErrorMessage(message) ||
isTimeoutErrorMessage(message)
);
return isFailoverErrorMessage(message);
}
function buildAllowedModelKeys(

View File

@@ -3,9 +3,11 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import {
buildBootstrapContextFiles,
classifyFailoverReason,
formatAssistantErrorText,
isBillingErrorMessage,
isContextOverflowError,
isFailoverErrorMessage,
isMessagingToolDuplicate,
normalizeTextForComparison,
sanitizeGoogleTurnOrdering,
@@ -238,6 +240,30 @@ describe("isBillingErrorMessage", () => {
});
});
describe("isFailoverErrorMessage", () => {
  // One representative message per failover category: auth, rate limit,
  // billing, timeout.
  it("matches auth/rate/billing/timeout", () => {
    const failoverMessages = [
      "invalid api key",
      "429 rate limit exceeded",
      "Your credit balance is too low",
      "request timed out",
    ];
    failoverMessages.forEach((message) => {
      expect(isFailoverErrorMessage(message)).toBe(true);
    });
  });
});
describe("classifyFailoverReason", () => {
  // Each message must map to its category; unrelated text maps to null.
  it("returns a stable reason", () => {
    const classified: Array<[string, string]> = [
      ["invalid api key", "auth"],
      ["429 too many requests", "rate_limit"],
      ["credit balance too low", "billing"],
      ["deadline exceeded", "timeout"],
    ];
    for (const [message, reason] of classified) {
      expect(classifyFailoverReason(message)).toBe(reason);
    }
    expect(classifyFailoverReason("bad request")).toBeNull();
  });
});
describe("formatAssistantErrorText", () => {
const makeAssistantError = (errorMessage: string): AssistantMessage =>
({

View File

@@ -261,6 +261,17 @@ export function isRateLimitErrorMessage(raw: string): boolean {
);
}
/**
 * Detects timeout-style error text.
 *
 * Lower-cases `raw` and reports whether it contains any known timeout or
 * deadline phrase. An empty string never matches.
 */
export function isTimeoutErrorMessage(raw: string): boolean {
  const text = raw.toLowerCase();
  if (text.length === 0) return false;
  const phrases = [
    "timeout",
    "timed out",
    "deadline exceeded",
    "context deadline exceeded",
  ];
  for (const phrase of phrases) {
    if (text.includes(phrase)) return true;
  }
  return false;
}
export function isBillingErrorMessage(raw: string): boolean {
const value = raw.toLowerCase();
if (!value) return false;
@@ -308,6 +319,32 @@ export function isAuthAssistantError(
return isAuthErrorMessage(msg.errorMessage ?? "");
}
/**
 * Category of error that makes a request eligible for failover.
 *
 * `classifyFailoverReason` only ever yields "auth", "rate_limit", "billing"
 * or "timeout" (and `null` for unrecognized text); it never returns
 * "unknown".
 */
export type FailoverReason =
| "auth"
| "rate_limit"
| "billing"
| "timeout"
| "unknown";
/**
 * Maps raw error text to the failover category it belongs to.
 *
 * The checks run in a fixed priority order (auth, rate limit, billing,
 * timeout) and the first match wins.
 *
 * @param raw Error message text to classify.
 * @returns The matching reason, or `null` when no matcher recognizes the text.
 */
export function classifyFailoverReason(raw: string): FailoverReason | null {
  const matchers: Array<[(text: string) => boolean, FailoverReason]> = [
    [isAuthErrorMessage, "auth"],
    [isRateLimitErrorMessage, "rate_limit"],
    [isBillingErrorMessage, "billing"],
    [isTimeoutErrorMessage, "timeout"],
  ];
  for (const [matches, reason] of matchers) {
    if (matches(raw)) return reason;
  }
  return null;
}
/** Returns true when `classifyFailoverReason` recognizes `raw` as a failover-worthy error. */
export function isFailoverErrorMessage(raw: string): boolean {
  const reason = classifyFailoverReason(raw);
  return reason !== null;
}
/**
 * Returns true when the assistant message stopped with an error whose
 * `errorMessage` is classified as a failover error. A missing message, a
 * non-error stop reason, or a missing error text all yield false.
 */
export function isFailoverAssistantError(
  msg: AssistantMessage | undefined,
): boolean {
  const stoppedWithError = msg?.stopReason === "error";
  return stoppedWithError && isFailoverErrorMessage(msg?.errorMessage ?? "");
}
function extractSupportedValues(raw: string): string[] {
const match =
raw.match(/supported values are:\s*([^\n.]+)/i) ??

View File

@@ -37,7 +37,7 @@ import { normalizeMessageProvider } from "../utils/message-provider.js";
import { resolveUserPath } from "../utils.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import {
markAuthProfileCooldown,
markAuthProfileFailure,
markAuthProfileGood,
markAuthProfileUsed,
} from "./auth-profiles.js";
@@ -55,17 +55,17 @@ import {
import { ensureClawdbotModelsJson } from "./models-config.js";
import {
buildBootstrapContextFiles,
classifyFailoverReason,
type EmbeddedContextFile,
ensureSessionHeader,
formatAssistantErrorText,
isAuthAssistantError,
isAuthErrorMessage,
isBillingAssistantError,
isBillingErrorMessage,
isContextOverflowError,
isFailoverAssistantError,
isFailoverErrorMessage,
isGoogleModelApi,
isRateLimitAssistantError,
isRateLimitErrorMessage,
isTimeoutErrorMessage,
pickFallbackThinkingLevel,
sanitizeGoogleTurnOrdering,
sanitizeSessionMessagesImages,
@@ -1438,10 +1438,22 @@ export async function runEmbeddedPiAgent(params: {
},
};
}
const promptFailoverReason = classifyFailoverReason(errorText);
if (
(isAuthErrorMessage(errorText) ||
isRateLimitErrorMessage(errorText) ||
isBillingErrorMessage(errorText)) &&
promptFailoverReason &&
promptFailoverReason !== "timeout" &&
lastProfileId
) {
await markAuthProfileFailure({
store: authStore,
profileId: lastProfileId,
reason: promptFailoverReason,
agentDir: params.agentDir,
});
}
if (
isFailoverErrorMessage(errorText) &&
promptFailoverReason !== "timeout" &&
(await advanceAuthProfile())
) {
continue;
@@ -1484,19 +1496,26 @@ export async function runEmbeddedPiAgent(params: {
0;
const authFailure = isAuthAssistantError(lastAssistant);
const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
const billingFailure = isBillingAssistantError(lastAssistant);
const failoverFailure = isFailoverAssistantError(lastAssistant);
const assistantFailoverReason = classifyFailoverReason(
lastAssistant?.errorMessage ?? "",
);
// Treat timeout as potential rate limit (Antigravity hangs on rate limit)
const shouldRotate =
(!aborted && (authFailure || rateLimitFailure || billingFailure)) ||
timedOut;
const shouldRotate = (!aborted && failoverFailure) || timedOut;
if (shouldRotate) {
// Mark current profile for cooldown before rotating
if (lastProfileId) {
await markAuthProfileCooldown({
const reason =
timedOut || assistantFailoverReason === "timeout"
? "timeout"
: (assistantFailoverReason ?? "unknown");
await markAuthProfileFailure({
store: authStore,
profileId: lastProfileId,
reason,
agentDir: params.agentDir,
});
if (timedOut) {
log.warn(
@@ -1518,10 +1537,25 @@ export async function runEmbeddedPiAgent(params: {
? "LLM request timed out."
: rateLimitFailure
? "LLM request rate limited."
: billingFailure
? "LLM request payment required."
: "LLM request unauthorized.");
throw new Error(message);
: authFailure
? "LLM request unauthorized."
: "LLM request failed.");
const err = new Error(message);
(err as { failoverReason?: string }).failoverReason =
assistantFailoverReason ?? undefined;
if (assistantFailoverReason === "billing") {
(err as { status?: number }).status = 402;
} else if (assistantFailoverReason === "rate_limit") {
(err as { status?: number }).status = 429;
} else if (assistantFailoverReason === "auth") {
(err as { status?: number }).status = 401;
} else if (
assistantFailoverReason === "timeout" ||
isTimeoutErrorMessage(message)
) {
(err as { status?: number }).status = 408;
}
throw err;
}
}