refactor(agents): centralize failover normalization

This commit is contained in:
Peter Steinberger
2026-01-09 22:15:03 +01:00
parent 6220106ab2
commit 402c35b91c
4 changed files with 290 additions and 186 deletions

View File

@@ -842,6 +842,57 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
);
}
/** Auth-profile cooldown settings with every duration expressed in milliseconds. */
type ResolvedAuthCooldownConfig = {
// Base duration handed to the billing disable calculation as `baseMs`.
billingBackoffMs: number;
// Cap handed to the billing disable calculation as `maxMs`.
billingMaxMs: number;
// Once the last failure is older than this, accumulated failure counts start over.
failureWindowMs: number;
};
/**
 * Resolve the effective auth-profile cooldown settings for one provider.
 *
 * Hour values are read from `cfg.auth.cooldowns` and converted to
 * milliseconds. A value is only accepted when it is a finite number greater
 * than zero. The billing backoff is resolved in priority order:
 *
 *   1. the per-provider entry in `billingBackoffHoursByProvider`
 *   2. the global `billingBackoffHours`
 *   3. the built-in default
 *
 * An invalid per-provider entry therefore no longer hides a valid global
 * value (previously `??` only skipped the entry when it was null/undefined).
 *
 * @param params.cfg - Optional application config; built-in defaults apply when absent.
 * @param params.providerId - Provider id to look up. Map keys are normalized
 *   before comparison but this value is compared as given, so callers should
 *   pass an already-normalized id.
 * @returns The resolved durations in milliseconds.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: ClawdbotConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const HOUR_MS = 60 * 60 * 1000;
  const defaults = {
    billingBackoffHours: 5,
    billingMaxHours: 24,
    failureWindowHours: 24,
  } as const;

  const isValidHours = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value) && value > 0;
  // First valid candidate wins; the fallback is used when none qualifies.
  const resolveHours = (fallback: number, ...candidates: unknown[]): number =>
    candidates.find(isValidHours) ?? fallback;

  const cooldowns = params.cfg?.auth?.cooldowns;
  const overrides = cooldowns?.billingBackoffHoursByProvider;
  // Only the first key that normalizes to the requested provider is considered.
  const billingOverride = overrides
    ? Object.entries(overrides).find(
        ([key]) => normalizeProviderId(key) === params.providerId,
      )?.[1]
    : undefined;

  const billingBackoffHours = resolveHours(
    defaults.billingBackoffHours,
    billingOverride,
    cooldowns?.billingBackoffHours,
  );
  const billingMaxHours = resolveHours(
    defaults.billingMaxHours,
    cooldowns?.billingMaxHours,
  );
  const failureWindowHours = resolveHours(
    defaults.failureWindowHours,
    cooldowns?.failureWindowHours,
  );

  return {
    billingBackoffMs: billingBackoffHours * HOUR_MS,
    billingMaxMs: billingMaxHours * HOUR_MS,
    failureWindowMs: failureWindowHours * HOUR_MS,
  };
}
function calculateAuthProfileBillingDisableMsWithConfig(params: {
errorCount: number;
baseMs: number;
@@ -872,6 +923,49 @@ export function resolveProfileUnusableUntilForDisplay(
return resolveProfileUnusableUntil(stats);
}
/**
 * Build the usage stats that should be stored after a profile failure.
 *
 * Counters carry over from `existing` unless its last failure is older than
 * the configured failure window, in which case they restart from zero before
 * this failure is added. A "billing" failure sets `disabledUntil` and
 * `disabledReason`; any other reason sets `cooldownUntil`.
 *
 * @param params.existing - Stats currently recorded for the profile.
 * @param params.now - Timestamp of this failure; it is subtracted from
 *   `lastFailureAt` and compared against a millisecond window.
 * @param params.reason - Why the profile failed.
 * @param params.cfgResolved - Resolved cooldown durations in milliseconds.
 * @returns A new stats object; `existing` is not mutated.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
}): ProfileUsageStats {
  const { existing, now, reason, cfgResolved } = params;

  const previousFailureAt = existing.lastFailureAt;
  const isStale =
    typeof previousFailureAt === "number" &&
    previousFailureAt > 0 &&
    now - previousFailureAt > cfgResolved.failureWindowMs;

  const errorCount = (isStale ? 0 : (existing.errorCount ?? 0)) + 1;
  const failureCounts = isStale ? {} : { ...existing.failureCounts };
  failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

  const next: ProfileUsageStats = {
    ...existing,
    errorCount,
    failureCounts,
    lastFailureAt: now,
  };

  if (reason !== "billing") {
    // Regular failures back off based on the total error count.
    next.cooldownUntil = now + calculateAuthProfileCooldownMs(errorCount);
    return next;
  }

  // Billing failures back off based on the billing-specific count only.
  const disableMs = calculateAuthProfileBillingDisableMsWithConfig({
    errorCount: failureCounts.billing ?? 1,
    baseMs: cfgResolved.billingBackoffMs,
    maxMs: cfgResolved.billingMaxMs,
  });
  next.disabledUntil = now + disableMs;
  next.disabledReason = "billing";
  return next;
}
/**
* Mark a profile as failed for a specific reason. Billing failures are treated
* as "disabled" (longer backoff) vs the regular cooldown window.
@@ -884,44 +978,6 @@ export async function markAuthProfileFailure(params: {
agentDir?: string;
}): Promise<void> {
const { store, profileId, reason, agentDir, cfg } = params;
const defaults = {
billingBackoffHours: 5,
billingMaxHours: 24,
failureWindowHours: 24,
} as const;
const resolveHours = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0
? value
: fallback;
const resolveCooldownConfig = (providerId: string) => {
const cooldowns = cfg?.auth?.cooldowns;
const billingOverride = (() => {
const map = cooldowns?.billingBackoffHoursByProvider;
if (!map) return undefined;
for (const [key, value] of Object.entries(map)) {
if (normalizeProviderId(key) === providerId) return value;
}
return undefined;
})();
const billingBackoffHours = resolveHours(
billingOverride ?? cooldowns?.billingBackoffHours,
defaults.billingBackoffHours,
);
const billingMaxHours = resolveHours(
cooldowns?.billingMaxHours,
defaults.billingMaxHours,
);
const failureWindowHours = resolveHours(
cooldowns?.failureWindowHours,
defaults.failureWindowHours,
);
return {
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
};
};
const updated = await updateAuthProfileStoreWithLock({
agentDir,
updater: (freshStore) => {
@@ -932,41 +988,17 @@ export async function markAuthProfileFailure(params: {
const now = Date.now();
const providerKey = normalizeProviderId(profile.provider);
const cfgResolved = resolveCooldownConfig(providerKey);
const cfgResolved = resolveAuthCooldownConfig({
cfg,
providerId: providerKey,
});
const windowMs = cfgResolved.failureWindowMs;
const windowExpired =
typeof existing.lastFailureAt === "number" &&
existing.lastFailureAt > 0 &&
now - existing.lastFailureAt > windowMs;
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
const nextErrorCount = baseErrorCount + 1;
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
const updatedStats: ProfileUsageStats = {
...existing,
errorCount: nextErrorCount,
failureCounts,
lastFailureAt: now,
};
if (reason === "billing") {
const billingCount = failureCounts.billing ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: billingCount,
baseMs: cfgResolved.billingBackoffMs,
maxMs: cfgResolved.billingMaxMs,
});
updatedStats.disabledUntil = now + backoffMs;
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
updatedStats.cooldownUntil = now + backoffMs;
}
freshStore.usageStats[profileId] = updatedStats;
freshStore.usageStats[profileId] = computeNextProfileUsageStats({
existing,
now,
reason,
cfgResolved,
});
return true;
},
});
@@ -982,38 +1014,17 @@ export async function markAuthProfileFailure(params: {
const providerKey = normalizeProviderId(
store.profiles[profileId]?.provider ?? "",
);
const cfgResolved = resolveCooldownConfig(providerKey);
const windowMs = cfgResolved.failureWindowMs;
const windowExpired =
typeof existing.lastFailureAt === "number" &&
existing.lastFailureAt > 0 &&
now - existing.lastFailureAt > windowMs;
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
const nextErrorCount = baseErrorCount + 1;
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
const cfgResolved = resolveAuthCooldownConfig({
cfg,
providerId: providerKey,
});
const updatedStats: ProfileUsageStats = {
...existing,
errorCount: nextErrorCount,
failureCounts,
lastFailureAt: now,
};
if (reason === "billing") {
const billingCount = failureCounts.billing ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: billingCount,
baseMs: cfgResolved.billingBackoffMs,
maxMs: cfgResolved.billingMaxMs,
});
updatedStats.disabledUntil = now + backoffMs;
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
updatedStats.cooldownUntil = now + backoffMs;
}
store.usageStats[profileId] = updatedStats;
store.usageStats[profileId] = computeNextProfileUsageStats({
existing,
now,
reason,
cfgResolved,
});
saveAuthProfileStore(store, agentDir);
}

View File

@@ -0,0 +1,44 @@
import { describe, expect, it } from "vitest";
import {
coerceToFailoverError,
describeFailoverError,
resolveFailoverReasonFromError,
} from "./failover-error.js";
// Unit tests for the failover error helpers: reason inference, coercion into
// FailoverError, and description of arbitrary thrown values.
describe("failover-error", () => {
  it("infers failover reason from HTTP status", () => {
    // Each pair is [error-like input, expected reason].
    const cases: Array<[unknown, string]> = [
      [{ status: 402 }, "billing"],
      [{ statusCode: "429" }, "rate_limit"],
      [{ status: 403 }, "auth"],
      [{ status: 408 }, "timeout"],
    ];
    for (const [input, expected] of cases) {
      expect(resolveFailoverReasonFromError(input)).toBe(expected);
    }
  });

  it("infers timeout from common node error codes", () => {
    for (const code of ["ETIMEDOUT", "ECONNRESET"]) {
      expect(resolveFailoverReasonFromError({ code })).toBe("timeout");
    }
  });

  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const context = { provider: "anthropic", model: "claude-opus-4-5" };
    const coerced = coerceToFailoverError("credit balance too low", context);
    expect(coerced?.name).toBe("FailoverError");
    expect(coerced?.reason).toBe("billing");
    expect(coerced?.status).toBe(402);
    expect(coerced?.provider).toBe("anthropic");
    expect(coerced?.model).toBe("claude-opus-4-5");
  });

  it("describes non-Error values consistently", () => {
    const { message, reason } = describeFailoverError(123);
    expect(message).toBe("123");
    expect(reason).toBeUndefined();
  });
});

View File

@@ -1,4 +1,7 @@
import type { FailoverReason } from "./pi-embedded-helpers.js";
import {
classifyFailoverReason,
type FailoverReason,
} from "./pi-embedded-helpers.js";
export class FailoverError extends Error {
readonly reason: FailoverReason;
@@ -51,3 +54,116 @@ export function resolveFailoverStatus(
return undefined;
}
}
/**
 * Extract an HTTP-style status code from an error-like value.
 *
 * `status` is checked first, then `statusCode`. Each property is accepted when
 * it is a finite number or a string made up only of digits (surrounding
 * whitespace is ignored). An unusable `status` (for example the string
 * "error" or NaN) no longer prevents a usable `statusCode` from being picked.
 *
 * @param err - Any thrown value.
 * @returns The status code, or `undefined` when `err` is not an object or
 *   neither property holds a usable value.
 */
function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") return undefined;
  const { status, statusCode } = err as {
    status?: unknown;
    statusCode?: unknown;
  };
  for (const candidate of [status, statusCode]) {
    let parsed: number | undefined;
    if (typeof candidate === "number") {
      parsed = candidate;
    } else if (typeof candidate === "string") {
      const digits = candidate.trim();
      if (/^\d+$/.test(digits)) parsed = Number(digits);
    }
    // Rejects NaN/Infinity, including overflow from absurdly long digit strings.
    if (parsed !== undefined && Number.isFinite(parsed)) return parsed;
  }
  return undefined;
}
/**
 * Read the string `code` property of an error-like value.
 *
 * @param err - Any thrown value.
 * @returns The code with surrounding whitespace removed, or `undefined` when
 *   `err` is not an object, `code` is not a string, or it is blank.
 */
function getErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) return undefined;
  const { code: raw } = err as { code?: unknown };
  if (typeof raw === "string") {
    const code = raw.trim();
    if (code.length > 0) return code;
  }
  return undefined;
}
/**
 * Produce a message string for any thrown value.
 *
 * Error instances yield their `message`; strings are returned as-is; numbers,
 * booleans and bigints are stringified; symbols yield their description;
 * other objects yield a string `message` property when they have one.
 *
 * @param err - Any thrown value.
 * @returns The message, or an empty string when none can be derived.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) return err.message;
  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      if (err === null) return "";
      const { message } = err as { message?: unknown };
      return typeof message === "string" ? message : "";
    }
    default:
      // undefined and functions carry no message.
      return "";
  }
}
/**
 * Infer why an error should trigger failover, if it should at all.
 *
 * Checks, in order: an existing FailoverError's own reason; the HTTP status
 * (402 billing, 429 rate_limit, 401/403 auth, 408 timeout); a small set of
 * network error codes treated as timeouts; and finally message
 * classification via `classifyFailoverReason`.
 *
 * @param err - Any thrown value.
 * @returns The inferred reason, or `null` when there is no message left to
 *   classify (otherwise whatever `classifyFailoverReason` returns).
 */
export function resolveFailoverReasonFromError(
  err: unknown,
): FailoverReason | null {
  if (isFailoverError(err)) return err.reason;

  switch (getStatusCode(err)) {
    case 402:
      return "billing";
    case 429:
      return "rate_limit";
    case 401:
    case 403:
      return "auth";
    case 408:
      return "timeout";
    default:
      break;
  }

  // Codes are compared case-insensitively.
  const upperCode = getErrorCode(err)?.toUpperCase() ?? "";
  const isTimeoutCode =
    upperCode === "ETIMEDOUT" ||
    upperCode === "ESOCKETTIMEDOUT" ||
    upperCode === "ECONNRESET" ||
    upperCode === "ECONNABORTED";
  if (isTimeoutCode) return "timeout";

  const text = getErrorMessage(err);
  return text ? classifyFailoverReason(text) : null;
}
/**
 * Summarize any thrown value as message / reason / status / code.
 *
 * A FailoverError is reported from its own fields. For anything else the
 * fields are derived with the module's helpers, and `reason` is `undefined`
 * when no failover reason can be inferred.
 *
 * When no message can be extracted the value is stringified. Plain objects
 * would stringify to the uninformative "[object Object]", so a JSON rendering
 * is used for them when one is available. The conversion is guarded so that
 * describing an error never throws on its own.
 *
 * @param err - Any thrown value.
 * @returns A description of the error.
 */
export function describeFailoverError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (isFailoverError(err)) {
    return {
      message: err.message,
      reason: err.reason,
      status: err.status,
      code: err.code,
    };
  }

  let message = getErrorMessage(err);
  if (!message) {
    try {
      message = String(err);
      if (message === "[object Object]") {
        const json: string | undefined = JSON.stringify(err);
        if (json) message = json;
      }
    } catch {
      // String()/JSON.stringify can throw (null-prototype objects, circular
      // references, BigInt fields); keep whatever text was produced so far.
    }
  }

  return {
    message,
    reason: resolveFailoverReasonFromError(err) ?? undefined,
    status: getStatusCode(err),
    code: getErrorCode(err),
  };
}
/**
 * Convert a thrown value into a FailoverError when it warrants failover.
 *
 * An existing FailoverError is returned unchanged. Otherwise a reason is
 * inferred; without one the result is `null`. The new error carries the given
 * context, the error's status (or the status `resolveFailoverStatus` supplies
 * for the reason), its code, and the original Error as `cause`.
 *
 * When no message can be extracted (for example `{ status: 429 }`) the value
 * is stringified. Plain objects would stringify to "[object Object]", so a
 * JSON rendering is used for them when one is available. The conversion is
 * guarded so that normalizing an error never throws on its own.
 *
 * @param err - Any thrown value.
 * @param context - Optional provider/model/profile metadata to attach.
 * @returns A FailoverError, or `null` when no failover reason applies.
 */
export function coerceToFailoverError(
  err: unknown,
  context?: {
    provider?: string;
    model?: string;
    profileId?: string;
  },
): FailoverError | null {
  if (isFailoverError(err)) return err;
  const reason = resolveFailoverReasonFromError(err);
  if (!reason) return null;

  let message = getErrorMessage(err);
  if (!message) {
    try {
      message = String(err);
      if (message === "[object Object]") {
        const json: string | undefined = JSON.stringify(err);
        if (json) message = json;
      }
    } catch {
      // String()/JSON.stringify can throw (null-prototype objects, circular
      // references, BigInt fields); keep whatever text was produced so far.
    }
  }

  const status = getStatusCode(err) ?? resolveFailoverStatus(reason);
  const code = getErrorCode(err);
  return new FailoverError(message, {
    reason,
    provider: context?.provider,
    model: context?.model,
    profileId: context?.profileId,
    status,
    code,
    cause: err instanceof Error ? err : undefined,
  });
}

View File

@@ -1,6 +1,10 @@
import type { ClawdbotConfig } from "../config/config.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import { type FailoverError, isFailoverError } from "./failover-error.js";
import {
coerceToFailoverError,
describeFailoverError,
isFailoverError,
} from "./failover-error.js";
import {
buildModelAliasIndex,
modelKey,
@@ -9,7 +13,6 @@ import {
resolveModelRefFromString,
} from "./model-selection.js";
import type { FailoverReason } from "./pi-embedded-helpers.js";
import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";
type ModelCandidate = {
provider: string;
@@ -36,81 +39,6 @@ function isAbortError(err: unknown): boolean {
return message.includes("aborted");
}
/**
 * Extract an HTTP-style status code from an error-like value.
 *
 * Reads `status`, falling back to `statusCode` only when `status` is null or
 * undefined. Numbers are returned as-is; digit-only strings are converted.
 *
 * @param err - Any thrown value.
 * @returns The status code, or `null` when none can be read.
 */
function getStatusCode(err: unknown): number | null {
  if (typeof err !== "object" || err === null) return null;
  const source = err as { status?: unknown; statusCode?: unknown };
  const raw = source.status ?? source.statusCode;
  switch (typeof raw) {
    case "number":
      return raw;
    case "string":
      return /^\d+$/.test(raw) ? Number(raw) : null;
    default:
      return null;
  }
}
/**
 * Read the string `code` property of an error-like value.
 *
 * @param err - Any thrown value.
 * @returns The code exactly as stored, or an empty string when `err` is not
 *   an object or `code` is not a string.
 */
function getErrorCode(err: unknown): string {
  if (typeof err !== "object" || err === null) return "";
  const { code } = err as { code?: unknown };
  if (typeof code !== "string") return "";
  return code;
}
/**
 * Produce a message string for any thrown value.
 *
 * Error instances yield their `message`; strings are returned as-is; numbers,
 * booleans and bigints are stringified; symbols yield their description;
 * other objects yield a string `message` property when they have one.
 *
 * @param err - Any thrown value.
 * @returns The message, or an empty string when none can be derived.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) return err.message;
  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      if (err === null) return "";
      const { message } = err as { message?: unknown };
      return typeof message === "string" ? message : "";
    }
    default:
      // undefined and functions carry no message.
      return "";
  }
}
/**
 * Summarize a thrown value for the fallback attempt log.
 *
 * A FailoverError is reported from its own fields, including `reason`. Any
 * other value gets a message (falling back to `String(err)`), plus a status
 * and code when they can be read; `reason` is left out in that case.
 *
 * @param err - Any thrown value.
 * @returns A description of the error.
 */
function describeFallbackError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (!isFailoverError(err)) {
    const message = getErrorMessage(err) || String(err);
    const status = getStatusCode(err);
    const code = getErrorCode(err);
    return {
      message,
      status: status ?? undefined,
      code: code || undefined,
    };
  }

  const { message, reason, status, code } = err as FailoverError;
  return { message, reason, status, code };
}
/**
 * Decide whether an error should trigger a switch to the next model candidate.
 *
 * True for a FailoverError, for HTTP status 401/402/403/408/429, for a small
 * set of network error codes (compared case-insensitively), or when the
 * message is recognized by `isFailoverErrorMessage`.
 *
 * @param err - Any thrown value.
 * @returns Whether fallback should be attempted.
 */
function shouldFallbackForError(err: unknown): boolean {
  if (isFailoverError(err)) return true;

  const status = getStatusCode(err);
  const isRetryableStatus =
    status === 401 ||
    status === 402 ||
    status === 403 ||
    status === 408 ||
    status === 429;
  if (isRetryableStatus) return true;

  switch (getErrorCode(err).toUpperCase()) {
    case "ETIMEDOUT":
    case "ESOCKETTIMEDOUT":
    case "ECONNRESET":
    case "ECONNABORTED":
      return true;
    default:
      break;
  }

  const text = getErrorMessage(err);
  return text ? isFailoverErrorMessage(text) : false;
}
function buildAllowedModelKeys(
cfg: ClawdbotConfig | undefined,
defaultProvider: string,
@@ -290,10 +218,15 @@ export async function runWithModelFallback<T>(params: {
};
} catch (err) {
if (isAbortError(err)) throw err;
const shouldFallback = shouldFallbackForError(err);
if (!shouldFallback) throw err;
lastError = err;
const described = describeFallbackError(err);
const normalized =
coerceToFailoverError(err, {
provider: candidate.provider,
model: candidate.model,
}) ?? err;
if (!isFailoverError(normalized)) throw err;
lastError = normalized;
const described = describeFailoverError(normalized);
attempts.push({
provider: candidate.provider,
model: candidate.model,
@@ -305,7 +238,7 @@ export async function runWithModelFallback<T>(params: {
await params.onError?.({
provider: candidate.provider,
model: candidate.model,
error: err,
error: normalized,
attempt: i + 1,
total: candidates.length,
});