fix(auth): billing backoff + cooldown UX
This commit is contained in:
@@ -11,6 +11,7 @@ import {
|
||||
CODEX_CLI_PROFILE_ID,
|
||||
calculateAuthProfileCooldownMs,
|
||||
ensureAuthProfileStore,
|
||||
markAuthProfileFailure,
|
||||
resolveAuthProfileOrder,
|
||||
} from "./auth-profiles.js";
|
||||
|
||||
@@ -440,6 +441,138 @@ describe("auth profile cooldowns", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("markAuthProfileFailure", () => {
  const PROFILE_ID = "anthropic:default";
  const HOUR_MS = 60 * 60 * 1000;

  /**
   * Creates a throwaway agent directory seeded with a single Anthropic
   * API-key profile, hands it to `run`, and always removes the directory
   * afterwards (even when `run` throws or an expectation fails).
   *
   * @param usageStats Optional pre-existing `usageStats` section to write
   *   into `auth-profiles.json`; omitted from the file when undefined.
   * @param run Test body receiving the path of the seeded directory.
   */
  async function withSeededAgentDir(
    usageStats: Record<string, unknown> | undefined,
    run: (agentDir: string) => Promise<void>,
  ): Promise<void> {
    const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "clawdbot-auth-"));
    try {
      fs.writeFileSync(
        path.join(agentDir, "auth-profiles.json"),
        JSON.stringify({
          version: 1,
          profiles: {
            [PROFILE_ID]: {
              type: "api_key",
              provider: "anthropic",
              key: "sk-default",
            },
          },
          ...(usageStats ? { usageStats } : {}),
        }),
      );
      await run(agentDir);
    } finally {
      fs.rmSync(agentDir, { recursive: true, force: true });
    }
  }

  it("disables billing failures for ~5 hours by default", async () => {
    await withSeededAgentDir(undefined, async (agentDir) => {
      const store = ensureAuthProfileStore(agentDir);
      const startedAt = Date.now();
      await markAuthProfileFailure({
        store,
        profileId: PROFILE_ID,
        reason: "billing",
        agentDir,
      });

      const disabledUntil = store.usageStats?.[PROFILE_ID]?.disabledUntil;
      expect(typeof disabledUntil).toBe("number");
      const remainingMs = (disabledUntil as number) - startedAt;
      expect(remainingMs).toBeGreaterThan(4.5 * HOUR_MS);
      expect(remainingMs).toBeLessThan(5.5 * HOUR_MS);
    });
  });

  it("honors per-provider billing backoff overrides", async () => {
    await withSeededAgentDir(undefined, async (agentDir) => {
      const store = ensureAuthProfileStore(agentDir);
      const startedAt = Date.now();
      await markAuthProfileFailure({
        store,
        profileId: PROFILE_ID,
        reason: "billing",
        agentDir,
        cfg: {
          auth: {
            cooldowns: {
              // Key is deliberately capitalized while the profile's provider
              // is lowercase, so the override must match case-insensitively.
              billingBackoffHoursByProvider: { Anthropic: 1 },
              billingMaxHours: 2,
            },
          },
        } as never,
      });

      const disabledUntil = store.usageStats?.[PROFILE_ID]?.disabledUntil;
      expect(typeof disabledUntil).toBe("number");
      const remainingMs = (disabledUntil as number) - startedAt;
      expect(remainingMs).toBeGreaterThan(0.8 * HOUR_MS);
      expect(remainingMs).toBeLessThan(1.2 * HOUR_MS);
    });
  });

  it("resets backoff counters outside the failure window", async () => {
    // Last failure is 48h old, i.e. well outside the 24h window set below.
    const staleStats = {
      [PROFILE_ID]: {
        errorCount: 9,
        failureCounts: { billing: 3 },
        lastFailureAt: Date.now() - 48 * HOUR_MS,
      },
    };

    await withSeededAgentDir(staleStats, async (agentDir) => {
      const store = ensureAuthProfileStore(agentDir);
      await markAuthProfileFailure({
        store,
        profileId: PROFILE_ID,
        reason: "billing",
        agentDir,
        cfg: {
          auth: { cooldowns: { failureWindowHours: 24 } },
        } as never,
      });

      expect(store.usageStats?.[PROFILE_ID]?.errorCount).toBe(1);
      expect(store.usageStats?.[PROFILE_ID]?.failureCounts?.billing).toBe(1);
    });
  });
});
|
||||
|
||||
describe("external CLI credential sync", () => {
|
||||
it("syncs Claude CLI credentials into anthropic:claude-cli", async () => {
|
||||
const agentDir = fs.mkdtempSync(
|
||||
|
||||
@@ -87,6 +87,7 @@ export type ProfileUsageStats = {
|
||||
disabledReason?: AuthProfileFailureReason;
|
||||
errorCount?: number;
|
||||
failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
|
||||
lastFailureAt?: number;
|
||||
};
|
||||
|
||||
export type AuthProfileStore = {
|
||||
@@ -841,15 +842,17 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
|
||||
);
|
||||
}
|
||||
|
||||
function calculateAuthProfileBillingDisableMs(errorCount: number): number {
|
||||
const normalized = Math.max(1, errorCount);
|
||||
const steps = [
|
||||
30 * 60 * 1000, // 30 min
|
||||
2 * 60 * 60 * 1000, // 2 hours
|
||||
8 * 60 * 60 * 1000, // 8 hours
|
||||
24 * 60 * 60 * 1000, // 24 hours
|
||||
];
|
||||
return steps[Math.min(normalized - 1, steps.length - 1)] as number;
|
||||
/**
 * Computes how long (in milliseconds) a profile stays disabled after a
 * billing failure, using exponential backoff: `baseMs` doubles for every
 * failure after the first and the result is capped at `maxMs`.
 *
 * Normalization applied to the inputs:
 * - `errorCount` below 1 is treated as 1 (first failure).
 * - `baseMs` is raised to at least one minute.
 * - `maxMs` is raised to at least the effective `baseMs`.
 * - The number of doublings is capped at 10.
 * - NaN inputs fall back to the floors above instead of producing NaN.
 *
 * @param params.errorCount Number of billing failures counted so far.
 * @param params.baseMs Backoff for the first failure.
 * @param params.maxMs Upper bound for the returned backoff.
 * @returns The disable duration in milliseconds; never NaN.
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const MIN_BASE_MS = 60_000;
  const MAX_DOUBLINGS = 10;

  // Math.max / Math.min return NaN as soon as one argument is NaN, so each
  // input is checked explicitly; otherwise a single corrupt value would turn
  // the whole cooldown (and the timestamp derived from it) into NaN.
  const normalized = Number.isNaN(params.errorCount)
    ? 1
    : Math.max(1, params.errorCount);
  const baseMs = Number.isNaN(params.baseMs)
    ? MIN_BASE_MS
    : Math.max(MIN_BASE_MS, params.baseMs);
  const maxMs = Number.isNaN(params.maxMs)
    ? baseMs
    : Math.max(baseMs, params.maxMs);

  const exponent = Math.min(normalized - 1, MAX_DOUBLINGS);
  const raw = baseMs * 2 ** exponent;
  return Math.min(maxMs, raw);
}
|
||||
|
||||
function resolveProfileUnusableUntil(stats: ProfileUsageStats): number | null {
|
||||
@@ -877,30 +880,85 @@ export async function markAuthProfileFailure(params: {
|
||||
store: AuthProfileStore;
|
||||
profileId: string;
|
||||
reason: AuthProfileFailureReason;
|
||||
cfg?: ClawdbotConfig;
|
||||
agentDir?: string;
|
||||
}): Promise<void> {
|
||||
const { store, profileId, reason, agentDir } = params;
|
||||
const { store, profileId, reason, agentDir, cfg } = params;
|
||||
const defaults = {
|
||||
billingBackoffHours: 5,
|
||||
billingMaxHours: 24,
|
||||
failureWindowHours: 24,
|
||||
} as const;
|
||||
const resolveHours = (value: unknown, fallback: number) =>
|
||||
typeof value === "number" && Number.isFinite(value) && value > 0
|
||||
? value
|
||||
: fallback;
|
||||
const resolveCooldownConfig = (providerId: string) => {
|
||||
const cooldowns = cfg?.auth?.cooldowns;
|
||||
const billingOverride = (() => {
|
||||
const map = cooldowns?.billingBackoffHoursByProvider;
|
||||
if (!map) return undefined;
|
||||
for (const [key, value] of Object.entries(map)) {
|
||||
if (normalizeProviderId(key) === providerId) return value;
|
||||
}
|
||||
return undefined;
|
||||
})();
|
||||
const billingBackoffHours = resolveHours(
|
||||
billingOverride ?? cooldowns?.billingBackoffHours,
|
||||
defaults.billingBackoffHours,
|
||||
);
|
||||
const billingMaxHours = resolveHours(
|
||||
cooldowns?.billingMaxHours,
|
||||
defaults.billingMaxHours,
|
||||
);
|
||||
const failureWindowHours = resolveHours(
|
||||
cooldowns?.failureWindowHours,
|
||||
defaults.failureWindowHours,
|
||||
);
|
||||
return {
|
||||
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
|
||||
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
|
||||
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
|
||||
};
|
||||
};
|
||||
|
||||
const updated = await updateAuthProfileStoreWithLock({
|
||||
agentDir,
|
||||
updater: (freshStore) => {
|
||||
if (!freshStore.profiles[profileId]) return false;
|
||||
const profile = freshStore.profiles[profileId];
|
||||
if (!profile) return false;
|
||||
freshStore.usageStats = freshStore.usageStats ?? {};
|
||||
const existing = freshStore.usageStats[profileId] ?? {};
|
||||
|
||||
const nextErrorCount = (existing.errorCount ?? 0) + 1;
|
||||
const failureCounts = { ...existing.failureCounts };
|
||||
const now = Date.now();
|
||||
const providerKey = normalizeProviderId(profile.provider);
|
||||
const cfgResolved = resolveCooldownConfig(providerKey);
|
||||
|
||||
const windowMs = cfgResolved.failureWindowMs;
|
||||
const windowExpired =
|
||||
typeof existing.lastFailureAt === "number" &&
|
||||
existing.lastFailureAt > 0 &&
|
||||
now - existing.lastFailureAt > windowMs;
|
||||
|
||||
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
|
||||
const nextErrorCount = baseErrorCount + 1;
|
||||
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
|
||||
const now = Date.now();
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
lastFailureAt: now,
|
||||
};
|
||||
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: billingCount,
|
||||
baseMs: cfgResolved.billingBackoffMs,
|
||||
maxMs: cfgResolved.billingMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
@@ -920,19 +978,34 @@ export async function markAuthProfileFailure(params: {
|
||||
|
||||
store.usageStats = store.usageStats ?? {};
|
||||
const existing = store.usageStats[profileId] ?? {};
|
||||
const nextErrorCount = (existing.errorCount ?? 0) + 1;
|
||||
const failureCounts = { ...existing.failureCounts };
|
||||
const now = Date.now();
|
||||
const providerKey = normalizeProviderId(
|
||||
store.profiles[profileId]?.provider ?? "",
|
||||
);
|
||||
const cfgResolved = resolveCooldownConfig(providerKey);
|
||||
const windowMs = cfgResolved.failureWindowMs;
|
||||
const windowExpired =
|
||||
typeof existing.lastFailureAt === "number" &&
|
||||
existing.lastFailureAt > 0 &&
|
||||
now - existing.lastFailureAt > windowMs;
|
||||
const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0);
|
||||
const nextErrorCount = baseErrorCount + 1;
|
||||
const failureCounts = windowExpired ? {} : { ...existing.failureCounts };
|
||||
failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;
|
||||
|
||||
const now = Date.now();
|
||||
const updatedStats: ProfileUsageStats = {
|
||||
...existing,
|
||||
errorCount: nextErrorCount,
|
||||
failureCounts,
|
||||
lastFailureAt: now,
|
||||
};
|
||||
if (reason === "billing") {
|
||||
const billingCount = failureCounts.billing ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: billingCount,
|
||||
baseMs: cfgResolved.billingBackoffMs,
|
||||
maxMs: cfgResolved.billingMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
|
||||
@@ -9,9 +9,12 @@ import { shouldLogVerbose } from "../globals.js";
|
||||
import { createSubsystemLogger } from "../logging.js";
|
||||
import { runCommandWithTimeout } from "../process/exec.js";
|
||||
import { resolveUserPath } from "../utils.js";
|
||||
import { FailoverError, resolveFailoverStatus } from "./failover-error.js";
|
||||
import {
|
||||
buildBootstrapContextFiles,
|
||||
classifyFailoverReason,
|
||||
type EmbeddedContextFile,
|
||||
isFailoverErrorMessage,
|
||||
} from "./pi-embedded-helpers.js";
|
||||
import type { EmbeddedPiRunResult } from "./pi-embedded-runner.js";
|
||||
import { buildAgentSystemPrompt } from "./system-prompt.js";
|
||||
@@ -310,6 +313,16 @@ async function runClaudeCliOnce(params: {
|
||||
}
|
||||
if (result.code !== 0) {
|
||||
const err = result.stderr.trim() || stdout || "Claude CLI failed.";
|
||||
if (isFailoverErrorMessage(err)) {
|
||||
const reason = classifyFailoverReason(err) ?? "unknown";
|
||||
const status = resolveFailoverStatus(reason);
|
||||
throw new FailoverError(err, {
|
||||
reason,
|
||||
provider: "claude-cli",
|
||||
model: params.modelId,
|
||||
status,
|
||||
});
|
||||
}
|
||||
throw new Error(err);
|
||||
}
|
||||
const parsed = parseClaudeCliJson(stdout);
|
||||
|
||||
53
src/agents/failover-error.ts
Normal file
53
src/agents/failover-error.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||||
|
||||
/**
 * Error carrying structured metadata about a failure that should trigger
 * failover: the classified reason plus, when known, the provider, model,
 * auth profile, a numeric status and an error code.
 */
export class FailoverError extends Error {
  /** Classified cause of the failure. */
  readonly reason: FailoverReason;
  /** Provider the failed request was sent to, if known. */
  readonly provider?: string;
  /** Model identifier used for the failed request, if known. */
  readonly model?: string;
  /** Auth profile that was in use, if known. */
  readonly profileId?: string;
  /** Numeric status associated with the failure, if any. */
  readonly status?: number;
  /** Error code associated with the failure, if any. */
  readonly code?: string;

  /**
   * @param message Human-readable error message.
   * @param params Failure metadata; `cause` is forwarded to `Error` as the
   *   standard error cause when provided.
   */
  constructor(
    message: string,
    params: {
      reason: FailoverReason;
      provider?: string;
      model?: string;
      profileId?: string;
      status?: number;
      code?: string;
      cause?: unknown;
    },
  ) {
    // Pass options only when a cause was supplied: `{ cause: undefined }`
    // would still install an own `cause` property on the error, which then
    // shows up as a spurious "cause: undefined" when the error is inspected.
    super(
      message,
      params.cause === undefined ? undefined : { cause: params.cause },
    );
    this.name = "FailoverError";
    this.reason = params.reason;
    this.provider = params.provider;
    this.model = params.model;
    this.profileId = params.profileId;
    this.status = params.status;
    this.code = params.code;
  }
}
|
||||
|
||||
/**
 * Type guard reporting whether `err` is a `FailoverError` instance
 * (checked with `instanceof`).
 */
export function isFailoverError(err: unknown): err is FailoverError {
  if (err instanceof FailoverError) {
    return true;
  }
  return false;
}
|
||||
|
||||
/** Status codes associated with the failover reasons that have one. */
const FAILOVER_STATUS_BY_REASON: ReadonlyMap<string, number> = new Map([
  ["billing", 402],
  ["rate_limit", 429],
  ["auth", 401],
  ["timeout", 408],
]);

/**
 * Maps a failover reason to its numeric status code.
 *
 * @param reason Classified failover reason.
 * @returns 402 for "billing", 429 for "rate_limit", 401 for "auth",
 *   408 for "timeout", and `undefined` for every other reason.
 */
export function resolveFailoverStatus(
  reason: FailoverReason,
): number | undefined {
  return FAILOVER_STATUS_BY_REASON.get(reason);
}
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
||||
import { type FailoverError, isFailoverError } from "./failover-error.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
modelKey,
|
||||
@@ -7,6 +8,7 @@ import {
|
||||
resolveConfiguredModelRef,
|
||||
resolveModelRefFromString,
|
||||
} from "./model-selection.js";
|
||||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||||
import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";
|
||||
|
||||
type ModelCandidate = {
|
||||
@@ -18,6 +20,9 @@ type FallbackAttempt = {
|
||||
provider: string;
|
||||
model: string;
|
||||
error: string;
|
||||
reason?: FailoverReason;
|
||||
status?: number;
|
||||
code?: string;
|
||||
};
|
||||
|
||||
function isAbortError(err: unknown): boolean {
|
||||
@@ -67,9 +72,32 @@ function getErrorMessage(err: unknown): string {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
 * Reduces a thrown value to the fields recorded for a fallback attempt.
 *
 * For a `FailoverError` the message, reason, status and code are copied
 * straight from the error. For anything else the message comes from
 * `getErrorMessage` (falling back to `String(err)` when that is empty),
 * the status from `getStatusCode` and the code from `getErrorCode`; no
 * reason is reported in that case.
 */
function describeFallbackError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (!isFailoverError(err)) {
    const message = getErrorMessage(err) || String(err);
    // Normalize "no value" results (null / empty string) to undefined.
    const status = getStatusCode(err) ?? undefined;
    const code = getErrorCode(err) || undefined;
    return { message, status, code };
  }

  const failover: FailoverError = err;
  const { message, reason, status, code } = failover;
  return { message, reason, status, code };
}
|
||||
|
||||
function shouldFallbackForError(err: unknown): boolean {
|
||||
if (isFailoverError(err)) return true;
|
||||
const statusCode = getStatusCode(err);
|
||||
if (statusCode && [401, 402, 403, 429].includes(statusCode)) return true;
|
||||
if (statusCode && [401, 402, 403, 408, 429].includes(statusCode)) return true;
|
||||
const code = getErrorCode(err).toUpperCase();
|
||||
if (
|
||||
["ETIMEDOUT", "ESOCKETTIMEDOUT", "ECONNRESET", "ECONNABORTED"].includes(
|
||||
@@ -265,10 +293,14 @@ export async function runWithModelFallback<T>(params: {
|
||||
const shouldFallback = shouldFallbackForError(err);
|
||||
if (!shouldFallback) throw err;
|
||||
lastError = err;
|
||||
const described = describeFallbackError(err);
|
||||
attempts.push({
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
error: described.message,
|
||||
reason: described.reason,
|
||||
status: described.status,
|
||||
code: described.code,
|
||||
});
|
||||
await params.onError?.({
|
||||
provider: candidate.provider,
|
||||
@@ -286,7 +318,9 @@ export async function runWithModelFallback<T>(params: {
|
||||
? attempts
|
||||
.map(
|
||||
(attempt) =>
|
||||
`${attempt.provider}/${attempt.model}: ${attempt.error}`,
|
||||
`${attempt.provider}/${attempt.model}: ${attempt.error}${
|
||||
attempt.reason ? ` (${attempt.reason})` : ""
|
||||
}`,
|
||||
)
|
||||
.join(" | ")
|
||||
: "unknown";
|
||||
|
||||
@@ -47,6 +47,7 @@ import {
|
||||
DEFAULT_MODEL,
|
||||
DEFAULT_PROVIDER,
|
||||
} from "./defaults.js";
|
||||
import { FailoverError, resolveFailoverStatus } from "./failover-error.js";
|
||||
import {
|
||||
ensureAuthProfileStore,
|
||||
getApiKeyForModel,
|
||||
@@ -1448,6 +1449,7 @@ export async function runEmbeddedPiAgent(params: {
|
||||
store: authStore,
|
||||
profileId: lastProfileId,
|
||||
reason: promptFailoverReason,
|
||||
cfg: params.config,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
}
|
||||
@@ -1515,6 +1517,7 @@ export async function runEmbeddedPiAgent(params: {
|
||||
store: authStore,
|
||||
profileId: lastProfileId,
|
||||
reason,
|
||||
cfg: params.config,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
if (timedOut) {
|
||||
@@ -1540,22 +1543,16 @@ export async function runEmbeddedPiAgent(params: {
|
||||
: authFailure
|
||||
? "LLM request unauthorized."
|
||||
: "LLM request failed.");
|
||||
const err = new Error(message);
|
||||
(err as { failoverReason?: string }).failoverReason =
|
||||
assistantFailoverReason ?? undefined;
|
||||
if (assistantFailoverReason === "billing") {
|
||||
(err as { status?: number }).status = 402;
|
||||
} else if (assistantFailoverReason === "rate_limit") {
|
||||
(err as { status?: number }).status = 429;
|
||||
} else if (assistantFailoverReason === "auth") {
|
||||
(err as { status?: number }).status = 401;
|
||||
} else if (
|
||||
assistantFailoverReason === "timeout" ||
|
||||
isTimeoutErrorMessage(message)
|
||||
) {
|
||||
(err as { status?: number }).status = 408;
|
||||
}
|
||||
throw err;
|
||||
const status =
|
||||
resolveFailoverStatus(assistantFailoverReason ?? "unknown") ??
|
||||
(isTimeoutErrorMessage(message) ? 408 : undefined);
|
||||
throw new FailoverError(message, {
|
||||
reason: assistantFailoverReason ?? "unknown",
|
||||
provider,
|
||||
model: modelId,
|
||||
profileId: lastProfileId,
|
||||
status,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user