refactor(agents): centralize failover handling
This commit is contained in:
@@ -151,6 +151,49 @@ describe("resolveAuthProfileOrder", () => {
|
||||
expect(order).toEqual(["anthropic:work", "anthropic:default"]);
|
||||
});
|
||||
|
||||
it("pushes disabled profiles to the end even with store order", () => {
  // The store order lists "anthropic:default" first, but that profile is
  // billing-disabled for the next minute, so it must be demoted.
  const disabledUntil = Date.now() + 60_000;
  const usageStats = {
    "anthropic:default": {
      disabledUntil,
      disabledReason: "billing" as const,
    },
    "anthropic:work": { lastUsed: 1 },
  };

  const order = resolveAuthProfileOrder({
    store: {
      ...store,
      order: { anthropic: ["anthropic:default", "anthropic:work"] },
      usageStats,
    },
    provider: "anthropic",
  });

  expect(order).toEqual(["anthropic:work", "anthropic:default"]);
});
|
||||
|
||||
it("pushes disabled profiles to the end even with configured order", () => {
  // The configured auth.order lists "anthropic:default" first, but that
  // profile is billing-disabled for the next minute, so it must be demoted.
  const disabledUntil = Date.now() + 60_000;
  const usageStats = {
    "anthropic:default": {
      disabledUntil,
      disabledReason: "billing" as const,
    },
    "anthropic:work": { lastUsed: 1 },
  };

  const order = resolveAuthProfileOrder({
    cfg: {
      auth: {
        order: { anthropic: ["anthropic:default", "anthropic:work"] },
        profiles: cfg.auth.profiles,
      },
    },
    store: { ...store, usageStats },
    provider: "anthropic",
  });

  expect(order).toEqual(["anthropic:work", "anthropic:default"]);
});
|
||||
|
||||
it("normalizes z.ai aliases in auth.order", () => {
|
||||
const order = resolveAuthProfileOrder({
|
||||
cfg: {
|
||||
|
||||
@@ -72,11 +72,21 @@ export type AuthProfileCredential =
|
||||
| TokenCredential
|
||||
| OAuthCredential;
|
||||
|
||||
/**
 * Why an auth profile was marked as failed.
 *
 * The reason is stored per profile (`disabledReason`, `failureCounts`) and
 * selects the backoff applied when the failure is recorded.
 */
export type AuthProfileFailureReason =
  /** Credentials were rejected. */
  | "auth"
  /** The provider throttled the request. */
  | "rate_limit"
  /** Payment/credit problem; recorded as a longer "disabled" window. */
  | "billing"
  /** The request timed out. */
  | "timeout"
  /** Failure that was not classified more specifically. */
  | "unknown";
|
||||
|
||||
/** Per-profile usage statistics for round-robin and cooldown tracking */
export type ProfileUsageStats = {
  /** Epoch-ms timestamp of the last successful use of the profile. */
  lastUsed?: number;
  /** Epoch-ms timestamp until which the profile is cooling down after a failure. */
  cooldownUntil?: number;
  /** Epoch-ms timestamp until which the profile is disabled (billing failures). */
  disabledUntil?: number;
  /** Failure reason that caused `disabledUntil` to be set. */
  disabledReason?: AuthProfileFailureReason;
  /** Failures recorded since the last successful use; reset to 0 on use. */
  errorCount?: number;
  /** Failure tallies broken down by reason. */
  failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
};
|
||||
|
||||
export type AuthProfileStore = {
|
||||
@@ -772,8 +782,9 @@ export function isProfileInCooldown(
|
||||
profileId: string,
|
||||
): boolean {
|
||||
const stats = store.usageStats?.[profileId];
|
||||
if (!stats?.cooldownUntil) return false;
|
||||
return Date.now() < stats.cooldownUntil;
|
||||
if (!stats) return false;
|
||||
const unusableUntil = resolveProfileUnusableUntil(stats);
|
||||
return unusableUntil ? Date.now() < unusableUntil : false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -796,6 +807,9 @@ export async function markAuthProfileUsed(params: {
|
||||
lastUsed: Date.now(),
|
||||
errorCount: 0,
|
||||
cooldownUntil: undefined,
|
||||
disabledUntil: undefined,
|
||||
disabledReason: undefined,
|
||||
failureCounts: undefined,
|
||||
};
|
||||
return true;
|
||||
},
|
||||
@@ -812,6 +826,9 @@ export async function markAuthProfileUsed(params: {
|
||||
lastUsed: Date.now(),
|
||||
errorCount: 0,
|
||||
cooldownUntil: undefined,
|
||||
disabledUntil: undefined,
|
||||
disabledReason: undefined,
|
||||
failureCounts: undefined,
|
||||
};
|
||||
saveAuthProfileStore(store, agentDir);
|
||||
}
|
||||
@@ -824,34 +841,74 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Resolve how long a profile stays disabled after a billing failure.
 *
 * The window escalates with the number of billing failures seen so far:
 * 30 min, 2 hours, 8 hours, then 24 hours for every later failure.
 *
 * @param errorCount Number of billing failures recorded for the profile.
 *   Values below 1 and NaN are treated as the first failure; fractional
 *   values are rounded down.
 * @returns The disable duration in milliseconds (always a finite number).
 */
function calculateAuthProfileBillingDisableMs(errorCount: number): number {
  const minuteMs = 60 * 1000;
  const hourMs = 60 * minuteMs;
  const longestMs = 24 * hourMs;
  const steps: readonly number[] = [
    30 * minuteMs, // 30 min
    2 * hourMs, // 2 hours
    8 * hourMs, // 8 hours
    longestMs, // 24 hours
  ];

  // A fractional or NaN count would produce a non-integer/NaN index and the
  // lookup would silently return undefined, so normalise to a whole count >= 1.
  const failures = Number.isNaN(errorCount)
    ? 1
    : Math.max(1, Math.floor(errorCount));
  const stepIndex = Math.min(failures - 1, steps.length - 1);

  // stepIndex is always in range here; the fallback only satisfies the checker.
  return steps[stepIndex] ?? longestMs;
}
|
||||
|
||||
/**
 * Compute the latest timestamp until which a profile should not be used.
 *
 * Considers both `cooldownUntil` and `disabledUntil`; a value only counts when
 * it is a finite number greater than zero.
 *
 * @param stats Usage statistics of a single profile.
 * @returns The larger of the valid timestamps, or `null` when neither is set.
 */
function resolveProfileUnusableUntil(stats: ProfileUsageStats): number | null {
  let latest: number | null = null;
  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
    if (typeof candidate !== "number") continue;
    if (!Number.isFinite(candidate) || candidate <= 0) continue;
    if (latest === null || candidate > latest) latest = candidate;
  }
  return latest;
}
|
||||
|
||||
/**
 * Look up the timestamp until which a stored profile is unusable.
 *
 * @param store Auth profile store whose `usageStats` are consulted.
 * @param profileId Identifier of the profile to inspect.
 * @returns The result of `resolveProfileUnusableUntil` for the profile's
 *   stats, or `null` when the store has no stats for that profile.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  const profileStats = store.usageStats?.[profileId];
  return profileStats ? resolveProfileUnusableUntil(profileStats) : null;
}
|
||||
|
||||
/**
|
||||
* Mark a profile as failed/rate-limited. Applies exponential backoff cooldown.
|
||||
* Cooldown times: 1min, 5min, 25min, max 1 hour.
|
||||
* Uses store lock to avoid overwriting concurrent usage updates.
|
||||
* Mark a profile as failed for a specific reason. Billing failures are treated
|
||||
* as "disabled" (longer backoff) vs the regular cooldown window.
|
||||
*/
|
||||
export async function markAuthProfileCooldown(params: {
|
||||
export async function markAuthProfileFailure(params: {
|
||||
store: AuthProfileStore;
|
||||
profileId: string;
|
||||
reason: AuthProfileFailureReason;
|
||||
agentDir?: string;
|
||||
}): Promise<void> {
|
||||
const { store, profileId, agentDir } = params;
|
||||
const { store, profileId, reason, agentDir } = params;
|
||||
const updated = await updateAuthProfileStoreWithLock({
|
||||
agentDir,
|
||||
updater: (freshStore) => {
|
||||
if (!freshStore.profiles[profileId]) return false;
|
||||
|
||||
freshStore.usageStats = freshStore.usageStats ?? {};
|
||||
const existing = freshStore.usageStats[profileId] ?? {};
|
||||
const errorCount = (existing.errorCount ?? 0) + 1;
|
||||
|
||||
// Exponential backoff: 1min, 5min, 25min, capped at 1h
|
||||
const backoffMs = calculateAuthProfileCooldownMs(errorCount);
|
||||
const nextErrorCount = (existing.errorCount ?? 0) + 1;
|
||||
const failureCounts = { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
|
||||
freshStore.usageStats[profileId] = {
|
||||
const now = Date.now();
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount,
|
||||
cooldownUntil: Date.now() + backoffMs,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
};
|
||||
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
updatedStats.cooldownUntil = now + backoffMs;
|
||||
}
|
||||
|
||||
freshStore.usageStats[profileId] = updatedStats;
|
||||
return true;
|
||||
},
|
||||
});
|
||||
@@ -863,19 +920,48 @@ export async function markAuthProfileCooldown(params: {
|
||||
|
||||
store.usageStats = store.usageStats ?? {};
|
||||
const existing = store.usageStats[profileId] ?? {};
|
||||
const errorCount = (existing.errorCount ?? 0) + 1;
|
||||
const nextErrorCount = (existing.errorCount ?? 0) + 1;
|
||||
const failureCounts = { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
|
||||
// Exponential backoff: 1min, 5min, 25min, capped at 1h
|
||||
const backoffMs = calculateAuthProfileCooldownMs(errorCount);
|
||||
|
||||
store.usageStats[profileId] = {
|
||||
const now = Date.now();
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount,
|
||||
cooldownUntil: Date.now() + backoffMs,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
};
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
updatedStats.cooldownUntil = now + backoffMs;
|
||||
}
|
||||
|
||||
store.usageStats[profileId] = updatedStats;
|
||||
saveAuthProfileStore(store, agentDir);
|
||||
}
|
||||
|
||||
/**
 * Mark a profile as failed/rate-limited without a specific reason.
 *
 * Thin wrapper that forwards to `markAuthProfileFailure` with reason
 * "unknown", which applies the regular exponential backoff cooldown
 * (1min, 5min, 25min, max 1 hour) and updates the store under its lock.
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;
  await markAuthProfileFailure({
    store,
    profileId,
    reason: "unknown",
    agentDir,
  });
}
|
||||
|
||||
/**
|
||||
* Clear cooldown for a profile (e.g., manual reset).
|
||||
* Uses store lock to avoid overwriting concurrent usage updates.
|
||||
@@ -973,7 +1059,8 @@ export function resolveAuthProfileOrder(params: {
|
||||
const inCooldown: Array<{ profileId: string; cooldownUntil: number }> = [];
|
||||
|
||||
for (const profileId of deduped) {
|
||||
const cooldownUntil = store.usageStats?.[profileId]?.cooldownUntil;
|
||||
const cooldownUntil =
|
||||
resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? 0;
|
||||
if (
|
||||
typeof cooldownUntil === "number" &&
|
||||
Number.isFinite(cooldownUntil) &&
|
||||
@@ -1057,7 +1144,8 @@ function orderProfilesByMode(
|
||||
const cooldownSorted = inCooldown
|
||||
.map((profileId) => ({
|
||||
profileId,
|
||||
cooldownUntil: store.usageStats?.[profileId]?.cooldownUntil ?? now,
|
||||
cooldownUntil:
|
||||
resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? now,
|
||||
}))
|
||||
.sort((a, b) => a.cooldownUntil - b.cooldownUntil)
|
||||
.map((entry) => entry.profileId);
|
||||
|
||||
@@ -56,6 +56,28 @@ describe("runWithModelFallback", () => {
|
||||
expect(run.mock.calls[1]?.[1]).toBe("claude-haiku-3-5");
|
||||
});
|
||||
|
||||
it("falls back on 402 payment required", async () => {
  const cfg = makeCfg();
  // First attempt rejects with an HTTP 402-style error, second one succeeds.
  const paymentRequired = Object.assign(new Error("payment required"), {
    status: 402,
  });
  const run = vi
    .fn()
    .mockRejectedValueOnce(paymentRequired)
    .mockResolvedValueOnce("ok");

  const result = await runWithModelFallback({
    cfg,
    provider: "openai",
    model: "gpt-4.1-mini",
    run,
  });

  expect(result.result).toBe("ok");
  expect(run).toHaveBeenCalledTimes(2);
  // The retry must target the configured fallback provider/model.
  const secondCall = run.mock.calls[1];
  expect(secondCall?.[0]).toBe("anthropic");
  expect(secondCall?.[1]).toBe("claude-haiku-3-5");
});
|
||||
|
||||
it("falls back on billing errors", async () => {
|
||||
const cfg = makeCfg();
|
||||
const run = vi
|
||||
|
||||
@@ -7,11 +7,7 @@ import {
|
||||
resolveConfiguredModelRef,
|
||||
resolveModelRefFromString,
|
||||
} from "./model-selection.js";
|
||||
import {
|
||||
isAuthErrorMessage,
|
||||
isBillingErrorMessage,
|
||||
isRateLimitErrorMessage,
|
||||
} from "./pi-embedded-helpers.js";
|
||||
import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";
|
||||
|
||||
type ModelCandidate = {
|
||||
provider: string;
|
||||
@@ -71,16 +67,6 @@ function getErrorMessage(err: unknown): string {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
 * Report whether an error message looks like a timeout.
 *
 * Matching is a case-insensitive substring check against a fixed marker list.
 *
 * @param raw Error message text.
 * @returns `true` when any timeout marker occurs in the message.
 */
function isTimeoutErrorMessage(raw: string): boolean {
  const lowered = raw.toLowerCase();
  const markers = [
    "timeout",
    "timed out",
    "deadline exceeded",
    "context deadline exceeded",
  ];
  return markers.some((marker) => lowered.includes(marker));
}
|
||||
|
||||
function shouldFallbackForError(err: unknown): boolean {
|
||||
const statusCode = getStatusCode(err);
|
||||
if (statusCode && [401, 402, 403, 429].includes(statusCode)) return true;
|
||||
@@ -94,12 +80,7 @@ function shouldFallbackForError(err: unknown): boolean {
|
||||
}
|
||||
const message = getErrorMessage(err);
|
||||
if (!message) return false;
|
||||
return (
|
||||
isAuthErrorMessage(message) ||
|
||||
isRateLimitErrorMessage(message) ||
|
||||
isBillingErrorMessage(message) ||
|
||||
isTimeoutErrorMessage(message)
|
||||
);
|
||||
return isFailoverErrorMessage(message);
|
||||
}
|
||||
|
||||
function buildAllowedModelKeys(
|
||||
|
||||
@@ -3,9 +3,11 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildBootstrapContextFiles,
|
||||
classifyFailoverReason,
|
||||
formatAssistantErrorText,
|
||||
isBillingErrorMessage,
|
||||
isContextOverflowError,
|
||||
isFailoverErrorMessage,
|
||||
isMessagingToolDuplicate,
|
||||
normalizeTextForComparison,
|
||||
sanitizeGoogleTurnOrdering,
|
||||
@@ -238,6 +240,30 @@ describe("isBillingErrorMessage", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("isFailoverErrorMessage", () => {
  it("matches auth/rate/billing/timeout", () => {
    // One representative message per failover category.
    const failoverMessages = [
      "invalid api key",
      "429 rate limit exceeded",
      "Your credit balance is too low",
      "request timed out",
    ];
    failoverMessages.forEach((message) => {
      expect(isFailoverErrorMessage(message)).toBe(true);
    });
  });
});
|
||||
|
||||
describe("classifyFailoverReason", () => {
  it("returns a stable reason", () => {
    // [message, expected classification]; null means "not a failover error".
    const cases: Array<[string, string | null]> = [
      ["invalid api key", "auth"],
      ["429 too many requests", "rate_limit"],
      ["credit balance too low", "billing"],
      ["deadline exceeded", "timeout"],
    ];
    for (const [message, expected] of cases) {
      expect(classifyFailoverReason(message)).toBe(expected);
    }
    expect(classifyFailoverReason("bad request")).toBeNull();
  });
});
|
||||
|
||||
describe("formatAssistantErrorText", () => {
|
||||
const makeAssistantError = (errorMessage: string): AssistantMessage =>
|
||||
({
|
||||
|
||||
@@ -261,6 +261,17 @@ export function isRateLimitErrorMessage(raw: string): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Report whether an error message looks like a timeout.
 *
 * Matching is a case-insensitive substring check against a fixed marker list.
 *
 * @param raw Error message text.
 * @returns `true` when any timeout marker occurs in the message; `false` for
 *   an empty message.
 */
export function isTimeoutErrorMessage(raw: string): boolean {
  const lowered = raw.toLowerCase();
  if (lowered.length === 0) return false;
  const markers = [
    "timeout",
    "timed out",
    "deadline exceeded",
    "context deadline exceeded",
  ];
  return markers.some((marker) => lowered.includes(marker));
}
|
||||
|
||||
export function isBillingErrorMessage(raw: string): boolean {
|
||||
const value = raw.toLowerCase();
|
||||
if (!value) return false;
|
||||
@@ -308,6 +319,32 @@ export function isAuthAssistantError(
|
||||
return isAuthErrorMessage(msg.errorMessage ?? "");
|
||||
}
|
||||
|
||||
/**
 * Category of a failure that can trigger failover.
 *
 * Text classification in this module yields "auth", "rate_limit", "billing"
 * or "timeout"; "unknown" is available for callers that need to record an
 * unclassified failure.
 */
export type FailoverReason =
  /** Credentials were rejected. */
  | "auth"
  /** The provider throttled the request. */
  | "rate_limit"
  /** Payment/credit problem. */
  | "billing"
  /** The request timed out. */
  | "timeout"
  /** Failure that was not classified more specifically. */
  | "unknown";
|
||||
|
||||
/**
 * Classify an error message into a failover reason.
 *
 * Matchers are tried in a fixed priority order (auth, rate limit, billing,
 * timeout); the first one that matches decides the result.
 *
 * @param raw Error message text.
 * @returns The matching reason, or `null` when no matcher recognises the text.
 */
export function classifyFailoverReason(raw: string): FailoverReason | null {
  const matchers: Array<[FailoverReason, (text: string) => boolean]> = [
    ["auth", isAuthErrorMessage],
    ["rate_limit", isRateLimitErrorMessage],
    ["billing", isBillingErrorMessage],
    ["timeout", isTimeoutErrorMessage],
  ];
  for (const [reason, matches] of matchers) {
    if (matches(raw)) return reason;
  }
  return null;
}
|
||||
|
||||
/**
 * Report whether an error message belongs to any failover category.
 *
 * @param raw Error message text.
 * @returns `true` when `classifyFailoverReason` yields a reason for the text.
 */
export function isFailoverErrorMessage(raw: string): boolean {
  const reason = classifyFailoverReason(raw);
  return reason !== null;
}
|
||||
|
||||
/**
 * Report whether an assistant message ended in a failover-worthy error.
 *
 * @param msg Assistant message to inspect; may be undefined.
 * @returns `true` only when the message stopped with reason "error" and its
 *   error text (empty string when absent) is a failover error message.
 */
export function isFailoverAssistantError(
  msg: AssistantMessage | undefined,
): boolean {
  if (!msg) return false;
  return msg.stopReason === "error"
    ? isFailoverErrorMessage(msg.errorMessage ?? "")
    : false;
}
|
||||
|
||||
function extractSupportedValues(raw: string): string[] {
|
||||
const match =
|
||||
raw.match(/supported values are:\s*([^\n.]+)/i) ??
|
||||
|
||||
@@ -37,7 +37,7 @@ import { normalizeMessageProvider } from "../utils/message-provider.js";
|
||||
import { resolveUserPath } from "../utils.js";
|
||||
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
||||
import {
|
||||
markAuthProfileCooldown,
|
||||
markAuthProfileFailure,
|
||||
markAuthProfileGood,
|
||||
markAuthProfileUsed,
|
||||
} from "./auth-profiles.js";
|
||||
@@ -55,17 +55,17 @@ import {
|
||||
import { ensureClawdbotModelsJson } from "./models-config.js";
|
||||
import {
|
||||
buildBootstrapContextFiles,
|
||||
classifyFailoverReason,
|
||||
type EmbeddedContextFile,
|
||||
ensureSessionHeader,
|
||||
formatAssistantErrorText,
|
||||
isAuthAssistantError,
|
||||
isAuthErrorMessage,
|
||||
isBillingAssistantError,
|
||||
isBillingErrorMessage,
|
||||
isContextOverflowError,
|
||||
isFailoverAssistantError,
|
||||
isFailoverErrorMessage,
|
||||
isGoogleModelApi,
|
||||
isRateLimitAssistantError,
|
||||
isRateLimitErrorMessage,
|
||||
isTimeoutErrorMessage,
|
||||
pickFallbackThinkingLevel,
|
||||
sanitizeGoogleTurnOrdering,
|
||||
sanitizeSessionMessagesImages,
|
||||
@@ -1438,10 +1438,22 @@ export async function runEmbeddedPiAgent(params: {
|
||||
},
|
||||
};
|
||||
}
|
||||
const promptFailoverReason = classifyFailoverReason(errorText);
|
||||
if (
|
||||
(isAuthErrorMessage(errorText) ||
|
||||
isRateLimitErrorMessage(errorText) ||
|
||||
isBillingErrorMessage(errorText)) &&
|
||||
promptFailoverReason &&
|
||||
promptFailoverReason !== "timeout" &&
|
||||
lastProfileId
|
||||
) {
|
||||
await markAuthProfileFailure({
|
||||
store: authStore,
|
||||
profileId: lastProfileId,
|
||||
reason: promptFailoverReason,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
}
|
||||
if (
|
||||
isFailoverErrorMessage(errorText) &&
|
||||
promptFailoverReason !== "timeout" &&
|
||||
(await advanceAuthProfile())
|
||||
) {
|
||||
continue;
|
||||
@@ -1484,19 +1496,26 @@ export async function runEmbeddedPiAgent(params: {
|
||||
0;
|
||||
const authFailure = isAuthAssistantError(lastAssistant);
|
||||
const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
|
||||
const billingFailure = isBillingAssistantError(lastAssistant);
|
||||
const failoverFailure = isFailoverAssistantError(lastAssistant);
|
||||
const assistantFailoverReason = classifyFailoverReason(
|
||||
lastAssistant?.errorMessage ?? "",
|
||||
);
|
||||
|
||||
// Treat timeout as potential rate limit (Antigravity hangs on rate limit)
|
||||
const shouldRotate =
|
||||
(!aborted && (authFailure || rateLimitFailure || billingFailure)) ||
|
||||
timedOut;
|
||||
const shouldRotate = (!aborted && failoverFailure) || timedOut;
|
||||
|
||||
if (shouldRotate) {
|
||||
// Mark current profile for cooldown before rotating
|
||||
if (lastProfileId) {
|
||||
await markAuthProfileCooldown({
|
||||
const reason =
|
||||
timedOut || assistantFailoverReason === "timeout"
|
||||
? "timeout"
|
||||
: (assistantFailoverReason ?? "unknown");
|
||||
await markAuthProfileFailure({
|
||||
store: authStore,
|
||||
profileId: lastProfileId,
|
||||
reason,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
if (timedOut) {
|
||||
log.warn(
|
||||
@@ -1518,10 +1537,25 @@ export async function runEmbeddedPiAgent(params: {
|
||||
? "LLM request timed out."
|
||||
: rateLimitFailure
|
||||
? "LLM request rate limited."
|
||||
: billingFailure
|
||||
? "LLM request payment required."
|
||||
: "LLM request unauthorized.");
|
||||
throw new Error(message);
|
||||
: authFailure
|
||||
? "LLM request unauthorized."
|
||||
: "LLM request failed.");
|
||||
const err = new Error(message);
|
||||
(err as { failoverReason?: string }).failoverReason =
|
||||
assistantFailoverReason ?? undefined;
|
||||
if (assistantFailoverReason === "billing") {
|
||||
(err as { status?: number }).status = 402;
|
||||
} else if (assistantFailoverReason === "rate_limit") {
|
||||
(err as { status?: number }).status = 429;
|
||||
} else if (assistantFailoverReason === "auth") {
|
||||
(err as { status?: number }).status = 401;
|
||||
} else if (
|
||||
assistantFailoverReason === "timeout" ||
|
||||
isTimeoutErrorMessage(message)
|
||||
) {
|
||||
(err as { status?: number }).status = 408;
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user