refactor(agents): centralize failover normalization
This commit is contained in:
@@ -842,6 +842,57 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Auth-profile cooldown settings after config resolution, with every
 * duration expressed in milliseconds.
 */
type ResolvedAuthCooldownConfig = {
  /** Base backoff used when a profile fails for billing reasons. */
  billingBackoffMs: number;
  /** Value handed to the billing backoff calculation as its `maxMs` cap. */
  billingMaxMs: number;
  /** Age beyond which the previous failure no longer counts toward the tally. */
  failureWindowMs: number;
};
|
||||
|
||||
/**
 * Resolve the auth cooldown settings that apply to one provider.
 *
 * Values are read from `cfg.auth.cooldowns` in hours and returned in
 * milliseconds. Any value that is not a finite, positive number is ignored.
 *
 * Billing backoff precedence: per-provider override
 * (`billingBackoffHoursByProvider`) → global `billingBackoffHours` → built-in
 * default. An unusable per-provider override falls through to the global
 * setting rather than jumping straight to the built-in default.
 *
 * @param params.cfg - Optional application config.
 * @param params.providerId - Provider id compared against each override key
 *   after the key is passed through `normalizeProviderId`; the caller is
 *   expected to pass an already-normalized id.
 * @returns The resolved durations in milliseconds.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: ClawdbotConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const msPerHour = 60 * 60 * 1000;
  const defaults = {
    billingBackoffHours: 5,
    billingMaxHours: 24,
    failureWindowHours: 24,
  } as const;

  const resolveHours = (value: unknown, fallback: number): number =>
    typeof value === "number" && Number.isFinite(value) && value > 0
      ? value
      : fallback;

  const cooldowns = params.cfg?.auth?.cooldowns;

  // First override whose normalized key matches the provider wins.
  const overrides = cooldowns?.billingBackoffHoursByProvider;
  const billingOverride = overrides
    ? Object.entries(overrides).find(
        ([key]) => normalizeProviderId(key) === params.providerId,
      )?.[1]
    : undefined;

  const billingBackoffHours = resolveHours(
    billingOverride,
    resolveHours(cooldowns?.billingBackoffHours, defaults.billingBackoffHours),
  );
  const billingMaxHours = resolveHours(
    cooldowns?.billingMaxHours,
    defaults.billingMaxHours,
  );
  const failureWindowHours = resolveHours(
    cooldowns?.failureWindowHours,
    defaults.failureWindowHours,
  );

  return {
    billingBackoffMs: billingBackoffHours * msPerHour,
    billingMaxMs: billingMaxHours * msPerHour,
    failureWindowMs: failureWindowHours * msPerHour,
  };
}
|
||||
|
||||
function calculateAuthProfileBillingDisableMsWithConfig(params: {
|
||||
errorCount: number;
|
||||
baseMs: number;
|
||||
@@ -872,6 +923,49 @@ export function resolveProfileUnusableUntilForDisplay(
|
||||
return resolveProfileUnusableUntil(stats);
|
||||
}
|
||||
|
||||
/**
 * Compute the usage stats a profile should have after one more failure.
 *
 * Returns a new stats object; `params.existing` is not modified.
 *
 * - If the previous failure is older than `cfgResolved.failureWindowMs`, the
 *   error count and per-reason counts restart from zero before this failure
 *   is added.
 * - A `"billing"` failure sets `disabledUntil` / `disabledReason` using the
 *   billing backoff derived from the billing failure count.
 * - Any other reason sets `cooldownUntil` using the regular cooldown derived
 *   from the overall error count.
 *
 * @param params.existing - Stats recorded so far for the profile.
 * @param params.now - Current timestamp (same unit as `lastFailureAt`).
 * @param params.reason - Why the profile failed.
 * @param params.cfgResolved - Resolved cooldown durations in milliseconds.
 * @returns The updated stats.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
}): ProfileUsageStats {
  const { existing, now, reason, cfgResolved } = params;

  const previousFailureAt = existing.lastFailureAt;
  const windowExpired =
    typeof previousFailureAt === "number" &&
    previousFailureAt > 0 &&
    now - previousFailureAt > cfgResolved.failureWindowMs;

  const nextErrorCount = (windowExpired ? 0 : (existing.errorCount ?? 0)) + 1;
  const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
  failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

  const updatedStats: ProfileUsageStats = {
    ...existing,
    errorCount: nextErrorCount,
    failureCounts,
    lastFailureAt: now,
  };

  if (reason === "billing") {
    // Billing backoff scales with billing failures only, not the overall tally.
    const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
      errorCount: failureCounts.billing ?? 1,
      baseMs: cfgResolved.billingBackoffMs,
      maxMs: cfgResolved.billingMaxMs,
    });
    updatedStats.disabledUntil = now + backoffMs;
    updatedStats.disabledReason = "billing";
  } else {
    updatedStats.cooldownUntil =
      now + calculateAuthProfileCooldownMs(nextErrorCount);
  }

  return updatedStats;
}
|
||||
|
||||
/**
|
||||
* Mark a profile as failed for a specific reason. Billing failures are treated
|
||||
* as "disabled" (longer backoff) vs the regular cooldown window.
|
||||
@@ -884,44 +978,6 @@ export async function markAuthProfileFailure(params: {
|
||||
agentDir?: string;
|
||||
}): Promise<void> {
|
||||
const { store, profileId, reason, agentDir, cfg } = params;
|
||||
const defaults = {
|
||||
billingBackoffHours: 5,
|
||||
billingMaxHours: 24,
|
||||
failureWindowHours: 24,
|
||||
} as const;
|
||||
const resolveHours = (value: unknown, fallback: number) =>
|
||||
typeof value === "number" && Number.isFinite(value) && value > 0
|
||||
? value
|
||||
: fallback;
|
||||
const resolveCooldownConfig = (providerId: string) => {
|
||||
const cooldowns = cfg?.auth?.cooldowns;
|
||||
const billingOverride = (() => {
|
||||
const map = cooldowns?.billingBackoffHoursByProvider;
|
||||
if (!map) return undefined;
|
||||
for (const [key, value] of Object.entries(map)) {
|
||||
if (normalizeProviderId(key) === providerId) return value;
|
||||
}
|
||||
return undefined;
|
||||
})();
|
||||
const billingBackoffHours = resolveHours(
|
||||
billingOverride ?? cooldowns?.billingBackoffHours,
|
||||
defaults.billingBackoffHours,
|
||||
);
|
||||
const billingMaxHours = resolveHours(
|
||||
cooldowns?.billingMaxHours,
|
||||
defaults.billingMaxHours,
|
||||
);
|
||||
const failureWindowHours = resolveHours(
|
||||
cooldowns?.failureWindowHours,
|
||||
defaults.failureWindowHours,
|
||||
);
|
||||
return {
|
||||
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
|
||||
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
|
||||
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
|
||||
};
|
||||
};
|
||||
|
||||
const updated = await updateAuthProfileStoreWithLock({
|
||||
agentDir,
|
||||
updater: (freshStore) => {
|
||||
@@ -932,41 +988,17 @@ export async function markAuthProfileFailure(params: {
|
||||
|
||||
const now = Date.now();
|
||||
const providerKey = normalizeProviderId(profile.provider);
|
||||
const cfgResolved = resolveCooldownConfig(providerKey);
|
||||
const cfgResolved = resolveAuthCooldownConfig({
|
||||
cfg,
|
||||
providerId: providerKey,
|
||||
});
|
||||
|
||||
const windowMs = cfgResolved.failureWindowMs;
|
||||
const windowExpired =
|
||||
typeof existing.lastFailureAt === "number" &&
|
||||
existing.lastFailureAt > 0 &&
|
||||
now - existing.lastFailureAt > windowMs;
|
||||
|
||||
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
|
||||
const nextErrorCount = baseErrorCount + 1;
|
||||
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
lastFailureAt: now,
|
||||
};
|
||||
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: billingCount,
|
||||
baseMs: cfgResolved.billingBackoffMs,
|
||||
maxMs: cfgResolved.billingMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
updatedStats.cooldownUntil = now + backoffMs;
|
||||
}
|
||||
|
||||
freshStore.usageStats[profileId] = updatedStats;
|
||||
freshStore.usageStats[profileId] = computeNextProfileUsageStats({
|
||||
existing,
|
||||
now,
|
||||
reason,
|
||||
cfgResolved,
|
||||
});
|
||||
return true;
|
||||
},
|
||||
});
|
||||
@@ -982,38 +1014,17 @@ export async function markAuthProfileFailure(params: {
|
||||
const providerKey = normalizeProviderId(
|
||||
store.profiles[profileId]?.provider ?? "",
|
||||
);
|
||||
const cfgResolved = resolveCooldownConfig(providerKey);
|
||||
const windowMs = cfgResolved.failureWindowMs;
|
||||
const windowExpired =
|
||||
typeof existing.lastFailureAt === "number" &&
|
||||
existing.lastFailureAt > 0 &&
|
||||
now - existing.lastFailureAt > windowMs;
|
||||
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
|
||||
const nextErrorCount = baseErrorCount + 1;
|
||||
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
const cfgResolved = resolveAuthCooldownConfig({
|
||||
cfg,
|
||||
providerId: providerKey,
|
||||
});
|
||||
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
lastFailureAt: now,
|
||||
};
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: billingCount,
|
||||
baseMs: cfgResolved.billingBackoffMs,
|
||||
maxMs: cfgResolved.billingMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
updatedStats.cooldownUntil = now + backoffMs;
|
||||
}
|
||||
|
||||
store.usageStats[profileId] = updatedStats;
|
||||
store.usageStats[profileId] = computeNextProfileUsageStats({
|
||||
existing,
|
||||
now,
|
||||
reason,
|
||||
cfgResolved,
|
||||
});
|
||||
saveAuthProfileStore(store, agentDir);
|
||||
}
|
||||
|
||||
|
||||
44
src/agents/failover-error.test.ts
Normal file
44
src/agents/failover-error.test.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
coerceToFailoverError,
|
||||
describeFailoverError,
|
||||
resolveFailoverReasonFromError,
|
||||
} from "./failover-error.js";
|
||||
|
||||
// Unit tests for the failover error normalization helpers.
describe("failover-error", () => {
  it("infers failover reason from HTTP status", () => {
    expect(resolveFailoverReasonFromError({ status: 402 })).toBe("billing");
    expect(resolveFailoverReasonFromError({ statusCode: "429" })).toBe(
      "rate_limit",
    );
    expect(resolveFailoverReasonFromError({ statusCode: 429 })).toBe(
      "rate_limit",
    );
    expect(resolveFailoverReasonFromError({ status: 401 })).toBe("auth");
    expect(resolveFailoverReasonFromError({ status: 403 })).toBe("auth");
    expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout");
  });

  it("infers timeout from common node error codes", () => {
    for (const code of [
      "ETIMEDOUT",
      "ESOCKETTIMEDOUT",
      "ECONNRESET",
      "ECONNABORTED",
    ]) {
      expect(resolveFailoverReasonFromError({ code })).toBe("timeout");
    }
    // Codes are compared case-insensitively.
    expect(resolveFailoverReasonFromError({ code: "etimedout" })).toBe(
      "timeout",
    );
  });

  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const err = coerceToFailoverError("credit balance too low", {
      provider: "anthropic",
      model: "claude-opus-4-5",
    });
    expect(err?.name).toBe("FailoverError");
    expect(err?.reason).toBe("billing");
    expect(err?.status).toBe(402);
    expect(err?.provider).toBe("anthropic");
    expect(err?.model).toBe("claude-opus-4-5");
  });

  it("describes non-Error values consistently", () => {
    const described = describeFailoverError(123);
    expect(described.message).toBe("123");
    expect(described.reason).toBeUndefined();
  });
});
|
||||
@@ -1,4 +1,7 @@
|
||||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||||
import {
|
||||
classifyFailoverReason,
|
||||
type FailoverReason,
|
||||
} from "./pi-embedded-helpers.js";
|
||||
|
||||
export class FailoverError extends Error {
|
||||
readonly reason: FailoverReason;
|
||||
@@ -51,3 +54,116 @@ export function resolveFailoverStatus(
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extract an HTTP status code from an error-like object.
 *
 * Looks at `status` first, then `statusCode`. A finite number is returned
 * as-is; a string made only of digits is converted to a number. An unusable
 * `status` (e.g. a non-numeric string or NaN) does not hide a usable
 * `statusCode`.
 *
 * @param err - Any thrown value.
 * @returns The status code, or `undefined` when none can be read.
 */
function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") return undefined;

  const { status, statusCode } = err as {
    status?: unknown;
    statusCode?: unknown;
  };

  for (const candidate of [status, statusCode]) {
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      return candidate;
    }
    if (typeof candidate === "string" && /^\d+$/.test(candidate)) {
      return Number(candidate);
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Read the string `code` property from an error-like object.
 *
 * @param err - Any thrown value.
 * @returns The trimmed code, or `undefined` when `err` is not an object, the
 *   code is not a string, or the code is blank.
 */
function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") return undefined;

  const candidate = (err as { code?: unknown }).code;
  if (typeof candidate !== "string") return undefined;

  // A whitespace-only code trims to "" and is reported as absent.
  return candidate.trim() || undefined;
}
|
||||
|
||||
/**
 * Produce a message string for any thrown value.
 *
 * - `Error` instances: their `message`.
 * - Strings: returned unchanged.
 * - Numbers, booleans, bigints: `String(value)`.
 * - Symbols: their description, or `""` when there is none.
 * - Other objects: the `message` property when it is a string.
 *
 * @param err - Any thrown value.
 * @returns The message, or `""` when nothing usable is found.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) return err.message;

  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      if (err === null) return "";
      const message = (err as { message?: unknown }).message;
      return typeof message === "string" ? message : "";
    }
    default:
      return "";
  }
}
|
||||
|
||||
/**
 * Work out why an error should trigger failover, if at all.
 *
 * Checks, in order:
 * 1. An existing `FailoverError` — its own `reason` is returned.
 * 2. HTTP status: 402 → `"billing"`, 429 → `"rate_limit"`, 401/403 →
 *    `"auth"`, 408 → `"timeout"`.
 * 3. Error code (case-insensitive): `ETIMEDOUT`, `ESOCKETTIMEDOUT`,
 *    `ECONNRESET`, `ECONNABORTED` → `"timeout"`.
 * 4. The error message, classified by `classifyFailoverReason`.
 *
 * @param err - Any thrown value.
 * @returns The failover reason, or `null` when the error has no message to
 *   classify; otherwise whatever `classifyFailoverReason` returns.
 */
export function resolveFailoverReasonFromError(
  err: unknown,
): FailoverReason | null {
  if (isFailoverError(err)) return err.reason;

  switch (getStatusCode(err)) {
    case 402:
      return "billing";
    case 429:
      return "rate_limit";
    case 401:
    case 403:
      return "auth";
    case 408:
      return "timeout";
    default:
      break;
  }

  const timeoutCodes = [
    "ETIMEDOUT",
    "ESOCKETTIMEDOUT",
    "ECONNRESET",
    "ECONNABORTED",
  ];
  const code = (getErrorCode(err) ?? "").toUpperCase();
  if (timeoutCodes.includes(code)) return "timeout";

  const message = getErrorMessage(err);
  return message ? classifyFailoverReason(message) : null;
}
|
||||
|
||||
/**
 * Summarize any thrown value as a plain record for reporting.
 *
 * A `FailoverError` contributes its own fields directly. For anything else
 * the message, reason, status and code are derived from the value; the
 * message falls back to `String(err)` when no message can be extracted.
 *
 * @param err - Any thrown value.
 * @returns The message plus, when available, reason, status and code.
 */
export function describeFailoverError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (isFailoverError(err)) {
    const { message, reason, status, code } = err;
    return { message, reason, status, code };
  }

  return {
    message: getErrorMessage(err) || String(err),
    // The resolver reports "no reason" as null; the summary uses undefined.
    reason: resolveFailoverReasonFromError(err) ?? undefined,
    status: getStatusCode(err),
    code: getErrorCode(err),
  };
}
|
||||
|
||||
/**
 * Convert a thrown value into a `FailoverError` when it warrants failover.
 *
 * An existing `FailoverError` is returned unchanged (the `context` is not
 * applied to it). Otherwise the reason is resolved from the value; when none
 * is found the result is `null`.
 *
 * @param err - Any thrown value.
 * @param context - Optional provider / model / profile metadata to attach to
 *   a newly created error.
 * @returns A `FailoverError`, or `null` when the value is not failover-worthy.
 */
export function coerceToFailoverError(
  err: unknown,
  context?: {
    provider?: string;
    model?: string;
    profileId?: string;
  },
): FailoverError | null {
  if (isFailoverError(err)) return err;

  const reason = resolveFailoverReasonFromError(err);
  if (!reason) return null;

  return new FailoverError(getErrorMessage(err) || String(err), {
    reason,
    provider: context?.provider,
    model: context?.model,
    profileId: context?.profileId,
    // Prefer the status carried by the error; otherwise derive one from the reason.
    status: getStatusCode(err) ?? resolveFailoverStatus(reason),
    code: getErrorCode(err),
    // Only real Error instances are kept as the cause.
    cause: err instanceof Error ? err : undefined,
  });
}
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
||||
import { type FailoverError, isFailoverError } from "./failover-error.js";
|
||||
import {
|
||||
coerceToFailoverError,
|
||||
describeFailoverError,
|
||||
isFailoverError,
|
||||
} from "./failover-error.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
modelKey,
|
||||
@@ -9,7 +13,6 @@ import {
|
||||
resolveModelRefFromString,
|
||||
} from "./model-selection.js";
|
||||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||||
import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";
|
||||
|
||||
type ModelCandidate = {
|
||||
provider: string;
|
||||
@@ -36,81 +39,6 @@ function isAbortError(err: unknown): boolean {
|
||||
return message.includes("aborted");
|
||||
}
|
||||
|
||||
/**
 * Extract an HTTP status code from an error-like object.
 *
 * Looks at `status` first, then `statusCode`. A finite number is returned
 * as-is; a string made only of digits is converted to a number. An unusable
 * `status` does not hide a usable `statusCode`.
 *
 * @param err - Any thrown value.
 * @returns The status code, or `null` when none can be read.
 */
function getStatusCode(err: unknown): number | null {
  if (!err || typeof err !== "object") return null;

  const { status, statusCode } = err as {
    status?: unknown;
    statusCode?: unknown;
  };

  for (const candidate of [status, statusCode]) {
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      return candidate;
    }
    if (typeof candidate === "string" && /^\d+$/.test(candidate)) {
      return Number(candidate);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Read the string `code` property from an error-like object.
 *
 * @param err - Any thrown value.
 * @returns The code exactly as stored, or `""` when `err` is not an object
 *   or its `code` is not a string.
 */
function getErrorCode(err: unknown): string {
  if (!err || typeof err !== "object") return "";

  const candidate = (err as { code?: unknown }).code;
  if (typeof candidate !== "string") return "";
  return candidate;
}
|
||||
|
||||
/**
 * Produce a message string for any thrown value.
 *
 * `Error` instances yield their `message`; strings are returned unchanged;
 * numbers, booleans and bigints are stringified; symbols yield their
 * description; other objects yield a string `message` property if present.
 *
 * @param err - Any thrown value.
 * @returns The message, or `""` when nothing usable is found.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) return err.message;

  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      if (err === null) return "";
      const message = (err as { message?: unknown }).message;
      return typeof message === "string" ? message : "";
    }
    default:
      return "";
  }
}
|
||||
|
||||
/**
 * Summarize any thrown value as a plain record for fallback reporting.
 *
 * A `FailoverError` contributes its message, reason, status and code. Any
 * other value contributes a message (falling back to `String(err)`), plus a
 * status and code when they can be read; no reason is reported for it.
 *
 * @param err - Any thrown value.
 * @returns The message plus, when available, reason, status and code.
 */
function describeFallbackError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (isFailoverError(err)) {
    const fe = err as FailoverError;
    const { message, reason, status, code } = fe;
    return { message, reason, status, code };
  }

  return {
    message: getErrorMessage(err) || String(err),
    // Helpers signal "absent" with null / ""; the summary uses undefined.
    status: getStatusCode(err) ?? undefined,
    code: getErrorCode(err) || undefined,
  };
}
|
||||
|
||||
/**
 * Decide whether an error should cause a switch to the next model candidate.
 *
 * Returns `true` for a `FailoverError`, for HTTP status 401/402/403/408/429,
 * and for the error codes `ETIMEDOUT`, `ESOCKETTIMEDOUT`, `ECONNRESET`,
 * `ECONNABORTED` (case-insensitive). Otherwise the decision is delegated to
 * `isFailoverErrorMessage` on the error's message.
 *
 * @param err - Any thrown value.
 * @returns `true` when fallback should be attempted; `false` when the error
 *   matches none of the checks above and has no message to inspect.
 */
function shouldFallbackForError(err: unknown): boolean {
  if (isFailoverError(err)) return true;

  const fallbackStatuses = [401, 402, 403, 408, 429];
  const statusCode = getStatusCode(err);
  if (statusCode != null && fallbackStatuses.includes(statusCode)) return true;

  const timeoutCodes = [
    "ETIMEDOUT",
    "ESOCKETTIMEDOUT",
    "ECONNRESET",
    "ECONNABORTED",
  ];
  if (timeoutCodes.includes(getErrorCode(err).toUpperCase())) return true;

  const message = getErrorMessage(err);
  return message ? isFailoverErrorMessage(message) : false;
}
|
||||
|
||||
function buildAllowedModelKeys(
|
||||
cfg: ClawdbotConfig | undefined,
|
||||
defaultProvider: string,
|
||||
@@ -290,10 +218,15 @@ export async function runWithModelFallback<T>(params: {
|
||||
};
|
||||
} catch (err) {
|
||||
if (isAbortError(err)) throw err;
|
||||
const shouldFallback = shouldFallbackForError(err);
|
||||
if (!shouldFallback) throw err;
|
||||
lastError = err;
|
||||
const described = describeFallbackError(err);
|
||||
const normalized =
|
||||
coerceToFailoverError(err, {
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
}) ?? err;
|
||||
if (!isFailoverError(normalized)) throw err;
|
||||
|
||||
lastError = normalized;
|
||||
const described = describeFailoverError(normalized);
|
||||
attempts.push({
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
@@ -305,7 +238,7 @@ export async function runWithModelFallback<T>(params: {
|
||||
await params.onError?.({
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
error: err,
|
||||
error: normalized,
|
||||
attempt: i + 1,
|
||||
total: candidates.length,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user