From c27b1441f70a893e95198c40269e70c39c4869d8 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 9 Jan 2026 21:57:52 +0100 Subject: [PATCH] fix(auth): billing backoff + cooldown UX --- CHANGELOG.md | 1 + docs/concepts/model-failover.md | 25 +++++- docs/gateway/doctor.md | 6 +- docs/start/faq.md | 2 + src/agents/auth-profiles.test.ts | 133 +++++++++++++++++++++++++++++++ src/agents/auth-profiles.ts | 111 +++++++++++++++++++++----- src/agents/claude-cli-runner.ts | 13 +++ src/agents/failover-error.ts | 53 ++++++++++++ src/agents/model-fallback.ts | 40 +++++++++- src/agents/pi-embedded-runner.ts | 29 +++---- src/commands/doctor-auth.ts | 27 +++++++ src/commands/models.list.test.ts | 2 + src/commands/models/list.ts | 63 ++++++++++++++- src/config/schema.ts | 12 +++ src/config/types.ts | 13 +++ src/config/zod-schema.ts | 10 +++ 16 files changed, 497 insertions(+), 43 deletions(-) create mode 100644 src/agents/failover-error.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index d51c38094..cb91212f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ - Models: centralize model override validation + hooks Gmail warnings in doctor. (#602) — thanks @steipete - Agents: avoid base-to-string error stringification in model fallback. (#604) — thanks @steipete - Agents: treat billing/insufficient-credits errors as failover-worthy so model fallbacks kick in. (#486) — thanks @steipete +- Auth: default billing disable backoff to 5h (doubling, 24h cap) and surface disabled/cooldown profiles in `models list` + doctor. (#486) — thanks @steipete - Commands: harden slash command registry and list text-only commands in `/commands`. - Models/Auth: show per-agent auth candidates in `/model status`, and add `clawdbot models auth order {get,set,clear}` (per-agent auth rotation overrides). — thanks @steipete - Debugging: add raw model stream logging flags and document gateway watch mode. 
diff --git a/docs/concepts/model-failover.md b/docs/concepts/model-failover.md index 4dc1dbf61..d3a438800 100644 --- a/docs/concepts/model-failover.md +++ b/docs/concepts/model-failover.md @@ -46,7 +46,7 @@ When a provider has multiple profiles, Clawdbot chooses an order like this: If no explicit order is configured, Clawdbot uses a round‑robin order: - **Primary key:** profile type (**OAuth before API keys**). - **Secondary key:** `usageStats.lastUsed` (oldest first, within each type). -- **Cooldown profiles** are moved to the end, ordered by soonest cooldown expiry. +- **Cooldown/disabled profiles** are moved to the end, ordered by soonest expiry. ### Why OAuth can “look lost” @@ -79,6 +79,27 @@ State is stored in `auth-profiles.json` under `usageStats`: } ``` +## Billing disables + +Billing/credit failures (for example “insufficient credits” / “credit balance too low”) are treated as failover‑worthy, but they’re usually not transient. Instead of a short cooldown, Clawdbot marks the profile as **disabled** (with a longer backoff) and rotates to the next profile/provider. + +State is stored in `auth-profiles.json`: + +```json +{ + "usageStats": { + "provider:profile": { + "disabledUntil": 1736178000000, + "disabledReason": "billing" + } + } +} +``` + +Defaults: +- Billing backoff starts at **5 hours**, doubles per billing failure, and caps at **24 hours**. +- Backoff counters reset if the profile hasn’t failed for **24 hours** (configurable). 
+ ## Model fallback If all profiles for a provider fail, Clawdbot moves to the next model in @@ -92,6 +113,8 @@ When a run starts with a model override (hooks or CLI), fallbacks still end at See [`docs/configuration.md`](/gateway/configuration) for: - `auth.profiles` / `auth.order` +- `auth.cooldowns.billingBackoffHours` / `auth.cooldowns.billingBackoffHoursByProvider` +- `auth.cooldowns.billingMaxHours` / `auth.cooldowns.failureWindowHours` - `agents.defaults.model.primary` / `agents.defaults.model.fallbacks` - `agents.defaults.imageModel` routing diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index 7120af4ba..efcf0363f 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -61,7 +61,7 @@ cat ~/.clawdbot/clawdbot.json - Legacy on-disk state migration (sessions/agent dir/WhatsApp auth). - State integrity and permissions checks (sessions, transcripts, state dir). - Config file permission checks (chmod 600) when running locally. -- Model auth health: checks OAuth expiry and can refresh expiring tokens. +- Model auth health: checks OAuth expiry, can refresh expiring tokens, and reports auth-profile cooldown/disabled states. - Legacy workspace dir detection (`~/clawdis`, `~/clawdbot`). - Sandbox image repair when sandboxing is enabled. - Legacy service migration and extra gateway detection. @@ -153,6 +153,10 @@ profile is stale, it suggests `claude setup-token` on the gateway host. Refresh prompts only appear when running interactively (TTY); `--non-interactive` skips refresh attempts. +Doctor also reports auth profiles that are temporarily unusable due to: +- short cooldowns (rate limits/timeouts/auth failures) +- longer disables (billing/credit failures) + ### 6) Sandbox image repair When sandboxing is enabled, doctor checks Docker images and offers to build or switch to legacy names if the current image is missing. 
diff --git a/docs/start/faq.md b/docs/start/faq.md index 3d3c25ebd..327713c10 100644 --- a/docs/start/faq.md +++ b/docs/start/faq.md @@ -422,6 +422,8 @@ Clawdbot uses provider‑prefixed IDs like: Yes. Config supports optional metadata for profiles and an ordering per provider (`auth.order.<provider>`). This does **not** store secrets; it maps IDs to provider/mode and sets rotation order. +Clawdbot may temporarily skip a profile if it’s in a short **cooldown** (rate limits/timeouts/auth failures) or a longer **disabled** state (billing/insufficient credits). To inspect this, run `clawdbot models status --json` and check `auth.unusableProfiles`. Tuning: `auth.cooldowns.billingBackoffHours*`. + You can also set a **per-agent** order override (stored in that agent’s `auth-profiles.json`) via the CLI: ```bash diff --git a/src/agents/auth-profiles.test.ts b/src/agents/auth-profiles.test.ts index 3b2185805..d4bdae3ff 100644 --- a/src/agents/auth-profiles.test.ts +++ b/src/agents/auth-profiles.test.ts @@ -11,6 +11,7 @@ import { CODEX_CLI_PROFILE_ID, calculateAuthProfileCooldownMs, ensureAuthProfileStore, + markAuthProfileFailure, resolveAuthProfileOrder, } from "./auth-profiles.js"; @@ -440,6 +441,138 @@ describe("auth profile cooldowns", () => { }); }); +describe("markAuthProfileFailure", () => { + it("disables billing failures for ~5 hours by default", async () => { + const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "clawdbot-auth-")); + try { + const authPath = path.join(agentDir, "auth-profiles.json"); + fs.writeFileSync( + authPath, + JSON.stringify({ + version: 1, + profiles: { + "anthropic:default": { + type: "api_key", + provider: "anthropic", + key: "sk-default", + }, + }, + }), + ); + + const store = ensureAuthProfileStore(agentDir); + const startedAt = Date.now(); + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "billing", + agentDir, + }); + + const disabledUntil = + store.usageStats?.["anthropic:default"]?.disabledUntil; 
expect(typeof disabledUntil).toBe("number"); + const remainingMs = (disabledUntil as number) - startedAt; + expect(remainingMs).toBeGreaterThan(4.5 * 60 * 60 * 1000); + expect(remainingMs).toBeLessThan(5.5 * 60 * 60 * 1000); + } finally { + fs.rmSync(agentDir, { recursive: true, force: true }); + } + }); + + it("honors per-provider billing backoff overrides", async () => { + const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "clawdbot-auth-")); + try { + const authPath = path.join(agentDir, "auth-profiles.json"); + fs.writeFileSync( + authPath, + JSON.stringify({ + version: 1, + profiles: { + "anthropic:default": { + type: "api_key", + provider: "anthropic", + key: "sk-default", + }, + }, + }), + ); + + const store = ensureAuthProfileStore(agentDir); + const startedAt = Date.now(); + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "billing", + agentDir, + cfg: { + auth: { + cooldowns: { + billingBackoffHoursByProvider: { Anthropic: 1 }, + billingMaxHours: 2, + }, + }, + } as never, + }); + + const disabledUntil = + store.usageStats?.["anthropic:default"]?.disabledUntil; + expect(typeof disabledUntil).toBe("number"); + const remainingMs = (disabledUntil as number) - startedAt; + expect(remainingMs).toBeGreaterThan(0.8 * 60 * 60 * 1000); + expect(remainingMs).toBeLessThan(1.2 * 60 * 60 * 1000); + } finally { + fs.rmSync(agentDir, { recursive: true, force: true }); + } + }); + + it("resets backoff counters outside the failure window", async () => { + const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "clawdbot-auth-")); + try { + const authPath = path.join(agentDir, "auth-profiles.json"); + const now = Date.now(); + fs.writeFileSync( + authPath, + JSON.stringify({ + version: 1, + profiles: { + "anthropic:default": { + type: "api_key", + provider: "anthropic", + key: "sk-default", + }, + }, + usageStats: { + "anthropic:default": { + errorCount: 9, + failureCounts: { billing: 3 }, + lastFailureAt: now - 48 * 60 * 60 * 1000, + 
/**
 * Computes how long a profile stays disabled after a billing failure.
 *
 * The delay is `baseMs` for the first failure and doubles with every further
 * failure (at most 10 doublings), never exceeding `maxMs`.
 *
 * Non-finite inputs are neutralised instead of being propagated: a `NaN`
 * result would end up in `disabledUntil`, where every `now < disabledUntil`
 * comparison is false and the profile would silently look usable again, and
 * an infinite base would disable the profile forever.
 *
 * @param params.errorCount - Billing failures counted so far; values below 1
 *   and `NaN` are treated as the first failure.
 * @param params.baseMs - Delay for the first failure. Floored at one minute;
 *   non-finite values fall back to that floor.
 * @param params.maxMs - Upper bound for the delay. Never lower than the
 *   effective base; `NaN` falls back to the effective base.
 * @returns The disable duration in milliseconds (never `NaN`).
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const minBaseMs = 60_000;
  // Caps the exponent so a runaway counter cannot overflow the multiplication.
  const maxDoublings = 10;

  const failures = Number.isNaN(params.errorCount)
    ? 1
    : Math.max(1, params.errorCount);
  const baseMs = Number.isFinite(params.baseMs)
    ? Math.max(minBaseMs, params.baseMs)
    : minBaseMs;
  // A cap below the base would make the first failure exceed the cap, so the
  // base always wins.
  const maxMs = Number.isNaN(params.maxMs)
    ? baseMs
    : Math.max(baseMs, params.maxMs);

  const exponent = Math.min(failures - 1, maxDoublings);
  return Math.min(maxMs, baseMs * 2 ** exponent);
}
+880,85 @@ export async function markAuthProfileFailure(params: { store: AuthProfileStore; profileId: string; reason: AuthProfileFailureReason; + cfg?: ClawdbotConfig; agentDir?: string; }): Promise { - const { store, profileId, reason, agentDir } = params; + const { store, profileId, reason, agentDir, cfg } = params; + const defaults = { + billingBackoffHours: 5, + billingMaxHours: 24, + failureWindowHours: 24, + } as const; + const resolveHours = (value: unknown, fallback: number) => + typeof value === "number" && Number.isFinite(value) && value > 0 + ? value + : fallback; + const resolveCooldownConfig = (providerId: string) => { + const cooldowns = cfg?.auth?.cooldowns; + const billingOverride = (() => { + const map = cooldowns?.billingBackoffHoursByProvider; + if (!map) return undefined; + for (const [key, value] of Object.entries(map)) { + if (normalizeProviderId(key) === providerId) return value; + } + return undefined; + })(); + const billingBackoffHours = resolveHours( + billingOverride ?? cooldowns?.billingBackoffHours, + defaults.billingBackoffHours, + ); + const billingMaxHours = resolveHours( + cooldowns?.billingMaxHours, + defaults.billingMaxHours, + ); + const failureWindowHours = resolveHours( + cooldowns?.failureWindowHours, + defaults.failureWindowHours, + ); + return { + billingBackoffMs: billingBackoffHours * 60 * 60 * 1000, + billingMaxMs: billingMaxHours * 60 * 60 * 1000, + failureWindowMs: failureWindowHours * 60 * 60 * 1000, + }; + }; + const updated = await updateAuthProfileStoreWithLock({ agentDir, updater: (freshStore) => { - if (!freshStore.profiles[profileId]) return false; + const profile = freshStore.profiles[profileId]; + if (!profile) return false; freshStore.usageStats = freshStore.usageStats ?? {}; const existing = freshStore.usageStats[profileId] ?? {}; - const nextErrorCount = (existing.errorCount ?? 
0) + 1; - const failureCounts = { ...existing.failureCounts }; + const now = Date.now(); + const providerKey = normalizeProviderId(profile.provider); + const cfgResolved = resolveCooldownConfig(providerKey); + + const windowMs = cfgResolved.failureWindowMs; + const windowExpired = + typeof existing.lastFailureAt === "number" && + existing.lastFailureAt > 0 && + now - existing.lastFailureAt > windowMs; + + const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 0); + const nextErrorCount = baseErrorCount + 1; + const failureCounts = windowExpired ? {} : { ...existing.failureCounts }; failureCounts[reason] = (failureCounts[reason] ?? 0) + 1; - const now = Date.now(); const updatedStats: ProfileUsageStats = { ...existing, errorCount: nextErrorCount, failureCounts, + lastFailureAt: now, }; if (reason === "billing") { const billingCount = failureCounts.billing ?? 1; - const backoffMs = calculateAuthProfileBillingDisableMs(billingCount); + const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ + errorCount: billingCount, + baseMs: cfgResolved.billingBackoffMs, + maxMs: cfgResolved.billingMaxMs, + }); updatedStats.disabledUntil = now + backoffMs; updatedStats.disabledReason = "billing"; } else { @@ -920,19 +978,34 @@ export async function markAuthProfileFailure(params: { store.usageStats = store.usageStats ?? {}; const existing = store.usageStats[profileId] ?? {}; - const nextErrorCount = (existing.errorCount ?? 0) + 1; - const failureCounts = { ...existing.failureCounts }; + const now = Date.now(); + const providerKey = normalizeProviderId( + store.profiles[profileId]?.provider ?? "", + ); + const cfgResolved = resolveCooldownConfig(providerKey); + const windowMs = cfgResolved.failureWindowMs; + const windowExpired = + typeof existing.lastFailureAt === "number" && + existing.lastFailureAt > 0 && + now - existing.lastFailureAt > windowMs; + const baseErrorCount = windowExpired ? 0 : (existing.errorCount ?? 
0); + const nextErrorCount = baseErrorCount + 1; + const failureCounts = windowExpired ? {} : { ...existing.failureCounts }; failureCounts[reason] = (failureCounts[reason] ?? 0) + 1; - const now = Date.now(); const updatedStats: ProfileUsageStats = { ...existing, errorCount: nextErrorCount, failureCounts, + lastFailureAt: now, }; if (reason === "billing") { const billingCount = failureCounts.billing ?? 1; - const backoffMs = calculateAuthProfileBillingDisableMs(billingCount); + const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ + errorCount: billingCount, + baseMs: cfgResolved.billingBackoffMs, + maxMs: cfgResolved.billingMaxMs, + }); updatedStats.disabledUntil = now + backoffMs; updatedStats.disabledReason = "billing"; } else { diff --git a/src/agents/claude-cli-runner.ts b/src/agents/claude-cli-runner.ts index 3b67e131c..26ca8090b 100644 --- a/src/agents/claude-cli-runner.ts +++ b/src/agents/claude-cli-runner.ts @@ -9,9 +9,12 @@ import { shouldLogVerbose } from "../globals.js"; import { createSubsystemLogger } from "../logging.js"; import { runCommandWithTimeout } from "../process/exec.js"; import { resolveUserPath } from "../utils.js"; +import { FailoverError, resolveFailoverStatus } from "./failover-error.js"; import { buildBootstrapContextFiles, + classifyFailoverReason, type EmbeddedContextFile, + isFailoverErrorMessage, } from "./pi-embedded-helpers.js"; import type { EmbeddedPiRunResult } from "./pi-embedded-runner.js"; import { buildAgentSystemPrompt } from "./system-prompt.js"; @@ -310,6 +313,16 @@ async function runClaudeCliOnce(params: { } if (result.code !== 0) { const err = result.stderr.trim() || stdout || "Claude CLI failed."; + if (isFailoverErrorMessage(err)) { + const reason = classifyFailoverReason(err) ?? 
"unknown"; + const status = resolveFailoverStatus(reason); + throw new FailoverError(err, { + reason, + provider: "claude-cli", + model: params.modelId, + status, + }); + } throw new Error(err); } const parsed = parseClaudeCliJson(stdout); diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts new file mode 100644 index 000000000..fcebd1e87 --- /dev/null +++ b/src/agents/failover-error.ts @@ -0,0 +1,53 @@ +import type { FailoverReason } from "./pi-embedded-helpers.js"; + +export class FailoverError extends Error { + readonly reason: FailoverReason; + readonly provider?: string; + readonly model?: string; + readonly profileId?: string; + readonly status?: number; + readonly code?: string; + + constructor( + message: string, + params: { + reason: FailoverReason; + provider?: string; + model?: string; + profileId?: string; + status?: number; + code?: string; + cause?: unknown; + }, + ) { + super(message, { cause: params.cause }); + this.name = "FailoverError"; + this.reason = params.reason; + this.provider = params.provider; + this.model = params.model; + this.profileId = params.profileId; + this.status = params.status; + this.code = params.code; + } +} + +export function isFailoverError(err: unknown): err is FailoverError { + return err instanceof FailoverError; +} + +export function resolveFailoverStatus( + reason: FailoverReason, +): number | undefined { + switch (reason) { + case "billing": + return 402; + case "rate_limit": + return 429; + case "auth": + return 401; + case "timeout": + return 408; + default: + return undefined; + } +} diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index acd331a26..607d69094 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -1,5 +1,6 @@ import type { ClawdbotConfig } from "../config/config.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js"; +import { type FailoverError, isFailoverError } from "./failover-error.js"; import { 
buildModelAliasIndex, modelKey, @@ -7,6 +8,7 @@ import { resolveConfiguredModelRef, resolveModelRefFromString, } from "./model-selection.js"; +import type { FailoverReason } from "./pi-embedded-helpers.js"; import { isFailoverErrorMessage } from "./pi-embedded-helpers.js"; type ModelCandidate = { @@ -18,6 +20,9 @@ type FallbackAttempt = { provider: string; model: string; error: string; + reason?: FailoverReason; + status?: number; + code?: string; }; function isAbortError(err: unknown): boolean { @@ -67,9 +72,32 @@ function getErrorMessage(err: unknown): string { return ""; } +function describeFallbackError(err: unknown): { + message: string; + reason?: FailoverReason; + status?: number; + code?: string; +} { + if (isFailoverError(err)) { + const fe = err as FailoverError; + return { + message: fe.message, + reason: fe.reason, + status: fe.status, + code: fe.code, + }; + } + return { + message: getErrorMessage(err) || String(err), + status: getStatusCode(err) ?? undefined, + code: getErrorCode(err) || undefined, + }; +} + function shouldFallbackForError(err: unknown): boolean { + if (isFailoverError(err)) return true; const statusCode = getStatusCode(err); - if (statusCode && [401, 402, 403, 429].includes(statusCode)) return true; + if (statusCode && [401, 402, 403, 408, 429].includes(statusCode)) return true; const code = getErrorCode(err).toUpperCase(); if ( ["ETIMEDOUT", "ESOCKETTIMEDOUT", "ECONNRESET", "ECONNABORTED"].includes( @@ -265,10 +293,14 @@ export async function runWithModelFallback(params: { const shouldFallback = shouldFallbackForError(err); if (!shouldFallback) throw err; lastError = err; + const described = describeFallbackError(err); attempts.push({ provider: candidate.provider, model: candidate.model, - error: err instanceof Error ? 
err.message : String(err), + error: described.message, + reason: described.reason, + status: described.status, + code: described.code, }); await params.onError?.({ provider: candidate.provider, @@ -286,7 +318,9 @@ export async function runWithModelFallback(params: { ? attempts .map( (attempt) => - `${attempt.provider}/${attempt.model}: ${attempt.error}`, + `${attempt.provider}/${attempt.model}: ${attempt.error}${ + attempt.reason ? ` (${attempt.reason})` : "" + }`, ) .join(" | ") : "unknown"; diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index b6482a4f1..ca9635ebd 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -47,6 +47,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER, } from "./defaults.js"; +import { FailoverError, resolveFailoverStatus } from "./failover-error.js"; import { ensureAuthProfileStore, getApiKeyForModel, @@ -1448,6 +1449,7 @@ export async function runEmbeddedPiAgent(params: { store: authStore, profileId: lastProfileId, reason: promptFailoverReason, + cfg: params.config, agentDir: params.agentDir, }); } @@ -1515,6 +1517,7 @@ export async function runEmbeddedPiAgent(params: { store: authStore, profileId: lastProfileId, reason, + cfg: params.config, agentDir: params.agentDir, }); if (timedOut) { @@ -1540,22 +1543,16 @@ export async function runEmbeddedPiAgent(params: { : authFailure ? "LLM request unauthorized." : "LLM request failed."); - const err = new Error(message); - (err as { failoverReason?: string }).failoverReason = - assistantFailoverReason ?? 
undefined; - if (assistantFailoverReason === "billing") { - (err as { status?: number }).status = 402; - } else if (assistantFailoverReason === "rate_limit") { - (err as { status?: number }).status = 429; - } else if (assistantFailoverReason === "auth") { - (err as { status?: number }).status = 401; - } else if ( - assistantFailoverReason === "timeout" || - isTimeoutErrorMessage(message) - ) { - (err as { status?: number }).status = 408; - } - throw err; + const status = + resolveFailoverStatus(assistantFailoverReason ?? "unknown") ?? + (isTimeoutErrorMessage(message) ? 408 : undefined); + throw new FailoverError(message, { + reason: assistantFailoverReason ?? "unknown", + provider, + model: modelId, + profileId: lastProfileId, + status, + }); } } diff --git a/src/commands/doctor-auth.ts b/src/commands/doctor-auth.ts index 6c2bb9e9a..270acb7b1 100644 --- a/src/commands/doctor-auth.ts +++ b/src/commands/doctor-auth.ts @@ -11,6 +11,7 @@ import { ensureAuthProfileStore, repairOAuthProfileIdMismatch, resolveApiKeyForProfile, + resolveProfileUnusableUntilForDisplay, } from "../agents/auth-profiles.js"; import type { ClawdbotConfig } from "../config/config.js"; import { stylePromptTitle } from "../terminal/prompt-style.js"; @@ -81,6 +82,32 @@ export async function noteAuthProfileHealth(params: { const store = ensureAuthProfileStore(undefined, { allowKeychainPrompt: params.allowKeychainPrompt, }); + const unusable = (() => { + const now = Date.now(); + const out: string[] = []; + for (const profileId of Object.keys(store.usageStats ?? {})) { + const until = resolveProfileUnusableUntilForDisplay(store, profileId); + if (!until || now >= until) continue; + const stats = store.usageStats?.[profileId]; + const remaining = formatRemainingShort(until - now); + const kind = + typeof stats?.disabledUntil === "number" && now < stats.disabledUntil + ? `disabled${stats.disabledReason ? 
`:${stats.disabledReason}` : ""}` + : "cooldown"; + const hint = kind.startsWith("disabled:billing") + ? "Top up credits (provider billing) or switch provider." + : "Wait for cooldown or switch provider."; + out.push( + `- ${profileId}: ${kind} (${remaining})${hint ? ` — ${hint}` : ""}`, + ); + } + return out; + })(); + + if (unusable.length > 0) { + note(unusable.join("\n"), "Auth profile cooldowns"); + } + let summary = buildAuthHealthSummary({ store, cfg: params.cfg, diff --git a/src/commands/models.list.test.ts b/src/commands/models.list.test.ts index 2d7e339d8..3a5b34ef0 100644 --- a/src/commands/models.list.test.ts +++ b/src/commands/models.list.test.ts @@ -13,6 +13,7 @@ const resolveAuthProfileDisplayLabel = vi.fn( const resolveAuthStorePathForDisplay = vi .fn() .mockReturnValue("/tmp/clawdbot-agent/auth-profiles.json"); +const resolveProfileUnusableUntilForDisplay = vi.fn().mockReturnValue(null); const resolveEnvApiKey = vi.fn().mockReturnValue(undefined); const getCustomProviderApiKey = vi.fn().mockReturnValue(undefined); const discoverAuthStorage = vi.fn().mockReturnValue({}); @@ -36,6 +37,7 @@ vi.mock("../agents/auth-profiles.js", () => ({ listProfilesForProvider, resolveAuthProfileDisplayLabel, resolveAuthStorePathForDisplay, + resolveProfileUnusableUntilForDisplay, })); vi.mock("../agents/model-auth.js", () => ({ diff --git a/src/commands/models/list.ts b/src/commands/models/list.ts index d08b753e0..613cc42c4 100644 --- a/src/commands/models/list.ts +++ b/src/commands/models/list.ts @@ -18,6 +18,7 @@ import { listProfilesForProvider, resolveAuthProfileDisplayLabel, resolveAuthStorePathForDisplay, + resolveProfileUnusableUntilForDisplay, } from "../../agents/auth-profiles.js"; import { getCustomProviderApiKey, @@ -174,15 +175,36 @@ function resolveProviderAuthOverview(params: { modelsPath: string; }): ProviderAuthOverview { const { provider, cfg, store } = params; + const now = Date.now(); const profiles = listProfilesForProvider(store, provider); + 
const withUnusableSuffix = (base: string, profileId: string) => { + const unusableUntil = resolveProfileUnusableUntilForDisplay( + store, + profileId, + ); + if (!unusableUntil || now >= unusableUntil) return base; + const stats = store.usageStats?.[profileId]; + const kind = + typeof stats?.disabledUntil === "number" && now < stats.disabledUntil + ? `disabled${stats.disabledReason ? `:${stats.disabledReason}` : ""}` + : "cooldown"; + const remaining = formatRemainingShort(unusableUntil - now); + return `${base} [${kind} ${remaining}]`; + }; const labels = profiles.map((profileId) => { const profile = store.profiles[profileId]; if (!profile) return `${profileId}=missing`; if (profile.type === "api_key") { - return `${profileId}=${maskApiKey(profile.key)}`; + return withUnusableSuffix( + `${profileId}=${maskApiKey(profile.key)}`, + profileId, + ); } if (profile.type === "token") { - return `${profileId}=token:${maskApiKey(profile.token)}`; + return withUnusableSuffix( + `${profileId}=token:${maskApiKey(profile.token)}`, + profileId, + ); } const display = resolveAuthProfileDisplayLabel({ cfg, store, profileId }); const suffix = @@ -191,7 +213,8 @@ function resolveProviderAuthOverview(params: { : display.startsWith(profileId) ? display.slice(profileId.length).trim() : `(${display})`; - return `${profileId}=OAuth${suffix ? ` ${suffix}` : ""}`; + const base = `${profileId}=OAuth${suffix ? ` ${suffix}` : ""}`; + return withUnusableSuffix(base, profileId); }); const oauthCount = profiles.filter( (id) => store.profiles[id]?.type === "oauth", @@ -770,6 +793,39 @@ export async function modelsStatusCommand( (profile) => profile.type === "oauth" || profile.type === "token", ); + const unusableProfiles = (() => { + const now = Date.now(); + const out: Array<{ + profileId: string; + provider?: string; + kind: "cooldown" | "disabled"; + reason?: string; + until: number; + remainingMs: number; + }> = []; + for (const profileId of Object.keys(store.usageStats ?? 
{})) { + const unusableUntil = resolveProfileUnusableUntilForDisplay( + store, + profileId, + ); + if (!unusableUntil || now >= unusableUntil) continue; + const stats = store.usageStats?.[profileId]; + const kind = + typeof stats?.disabledUntil === "number" && now < stats.disabledUntil + ? "disabled" + : "cooldown"; + out.push({ + profileId, + provider: store.profiles[profileId]?.provider, + kind, + reason: stats?.disabledReason, + until: unusableUntil, + remainingMs: unusableUntil - now, + }); + } + return out.sort((a, b) => a.remainingMs - b.remainingMs); + })(); + const checkStatus = (() => { const hasExpiredOrMissing = oauthProfiles.some((profile) => @@ -805,6 +861,7 @@ export async function modelsStatusCommand( providersWithOAuth: providersWithOauth, missingProvidersInUse, providers: providerAuth, + unusableProfiles, oauth: { warnAfterMs: authHealth.warnAfterMs, profiles: authHealth.profiles, diff --git a/src/config/schema.ts b/src/config/schema.ts index 06df7c2ce..0c4b7f4bb 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -95,6 +95,10 @@ const FIELD_LABELS: Record = { "agents.defaults.workspace": "Workspace", "auth.profiles": "Auth Profiles", "auth.order": "Auth Profile Order", + "auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)", + "auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides", + "auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)", + "auth.cooldowns.failureWindowHours": "Failover Window (hours)", "agents.defaults.models": "Models", "agents.defaults.model.primary": "Primary Model", "agents.defaults.model.fallbacks": "Model Fallbacks", @@ -156,6 +160,14 @@ const FIELD_HELP: Record = { "auth.profiles": "Named auth profiles (provider + mode + optional email).", "auth.order": "Ordered auth profile IDs per provider (used for automatic failover).", + "auth.cooldowns.billingBackoffHours": + "Base backoff (hours) when a profile fails due to billing/insufficient credits (default: 5).", + 
"auth.cooldowns.billingBackoffHoursByProvider": + "Optional per-provider overrides for billing backoff (hours).", + "auth.cooldowns.billingMaxHours": + "Cap (hours) for billing backoff (default: 24).", + "auth.cooldowns.failureWindowHours": + "Failure window (hours) for backoff counters (default: 24).", "agents.defaults.models": "Configured model catalog (keys are full provider/model IDs).", "agents.defaults.model.primary": "Primary model (provider/model).", diff --git a/src/config/types.ts b/src/config/types.ts index 569a1173f..0791c91ce 100644 --- a/src/config/types.ts +++ b/src/config/types.ts @@ -1210,6 +1210,19 @@ export type AuthProfileConfig = { export type AuthConfig = { profiles?: Record; order?: Record; + cooldowns?: { + /** Default billing backoff (hours). Default: 5. */ + billingBackoffHours?: number; + /** Optional per-provider billing backoff (hours). */ + billingBackoffHoursByProvider?: Record; + /** Billing backoff cap (hours). Default: 24. */ + billingMaxHours?: number; + /** + * Failure window for backoff counters (hours). If no failures occur within + * this window, counters reset. Default: 24. + */ + failureWindowHours?: number; + }; }; export type AgentModelEntryConfig = { diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 9199b64a8..c8bd5f19e 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -1198,6 +1198,16 @@ export const ClawdbotSchema = z ) .optional(), order: z.record(z.string(), z.array(z.string())).optional(), + cooldowns: z + .object({ + billingBackoffHours: z.number().positive().optional(), + billingBackoffHoursByProvider: z + .record(z.string(), z.number().positive()) + .optional(), + billingMaxHours: z.number().positive().optional(), + failureWindowHours: z.number().positive().optional(), + }) + .optional(), }) .optional(), models: ModelsConfigSchema,