diff --git a/CHANGELOG.md b/CHANGELOG.md index acaa581fa..9d2b7d6e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Groups: `whatsapp.groups`, `telegram.groups`, and `imessage.groups` now act as allowlists when set. Add `"*"` to keep allow-all behavior. ### Fixes +- Messages: stop defaulting ack reactions to 👀 when identity emoji is missing. - Auto-reply: require slash for control commands to avoid false triggers in normal text. - Auto-reply: treat steer during compaction as a follow-up, queued until compaction completes. - Auth: lock auth profile refreshes to avoid multi-instance OAuth logouts; keep credentials on refresh failure. @@ -67,6 +68,7 @@ - Discord: use channel IDs for DMs instead of user IDs. Thanks @VACInc for PR #261. - Docs: clarify Slack manifest scopes (current vs optional) with references. Thanks @jarvis-medmatic for PR #235. - Control UI: avoid Slack config ReferenceError by reading slack config snapshots. Thanks @sreekaransrinath for PR #249. +- Auth: rotate across multiple OAuth profiles with cooldown tracking and email-based profile IDs. Thanks @mukhtharcm for PR #269. - Telegram: honor routing.groupChat.mentionPatterns for group mention gating. Thanks Kevin Kern (@regenrek) for PR #242. - Telegram: gate groups via `telegram.groups` allowlist (align with WhatsApp/iMessage). Thanks @kitze for PR #241. - Telegram: support media groups (multi-image messages). Thanks @obviyus for PR #220. diff --git a/docs/model-failover.md b/docs/model-failover.md new file mode 100644 index 000000000..9b5ee948f --- /dev/null +++ b/docs/model-failover.md @@ -0,0 +1,75 @@ +--- +summary: "How Clawdbot rotates auth profiles and falls back across models" +read_when: + - Diagnosing auth profile rotation, cooldowns, or model fallback behavior + - Updating failover rules for auth profiles or models +--- + +# Model failover + +Clawdbot handles failures in two stages: +1) **Auth profile rotation** within the current provider. 
+2) **Model fallback** to the next model in `agent.model.fallbacks`. + +This doc explains the runtime rules and the data that backs them. + +## Profile IDs + +OAuth logins create distinct profiles so multiple accounts can coexist. +- Default: `provider:default` when no email is available. +- OAuth with email: `provider:<email>` (for example `google-antigravity:user@gmail.com`). + +Profiles live in `~/.clawdbot/agent/auth-profiles.json` under `profiles`. + +## Rotation order + +When a provider has multiple profiles, Clawdbot chooses an order like this: + +1) **Explicit config**: `auth.order[provider]` (if set). +2) **Configured profiles**: `auth.profiles` filtered by provider. +3) **Stored profiles**: entries in `auth-profiles.json` for the provider. + +If no explicit order is configured, Clawdbot uses a round‑robin order: +- **Primary key:** `usageStats.lastUsed` (oldest first). +- **Secondary key:** profile type (OAuth before API keys). +- **Cooldown profiles** are moved to the end, ordered by soonest cooldown expiry. + +## Cooldowns + +When a profile fails due to auth/rate‑limit errors (or a timeout that looks +like rate limiting), Clawdbot marks it in cooldown and moves to the next profile. + +Cooldowns use exponential backoff: +- 1 minute +- 5 minutes +- 25 minutes +- 1 hour (cap) + +State is stored in `auth-profiles.json` under `usageStats`: + +```json +{ + "usageStats": { + "provider:profile": { + "lastUsed": 1736160000000, + "cooldownUntil": 1736160600000, + "errorCount": 2 + } + } +} +``` + +## Model fallback + +If all profiles for a provider fail, Clawdbot moves to the next model in +`agent.model.fallbacks`. This applies to auth failures, rate limits, and +timeouts that exhausted profile rotation. + +## Related config + +See `docs/configuration.md` for: +- `auth.profiles` / `auth.order` +- `agent.model.primary` / `agent.model.fallbacks` +- `agent.imageModel` routing + +See `docs/models.md` for the broader model selection and fallback overview. 
diff --git a/docs/models.md b/docs/models.md index cf88065a5..bb4f47344 100644 --- a/docs/models.md +++ b/docs/models.md @@ -77,6 +77,7 @@ Output - Image routing uses `agent.imageModel` **only when configured** and the primary model lacks image input. - Persist last successful provider/model to session entry; auth profile success is global. +- See `docs/model-failover.md` for auth profile rotation, cooldowns, and timeout handling. ## Tests diff --git a/src/agents/auth-profiles.test.ts b/src/agents/auth-profiles.test.ts index 493f2c09d..ea5d5fdcb 100644 --- a/src/agents/auth-profiles.test.ts +++ b/src/agents/auth-profiles.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest"; import { type AuthProfileStore, + calculateAuthProfileCooldownMs, resolveAuthProfileOrder, } from "./auth-profiles.js"; @@ -105,4 +106,87 @@ describe("resolveAuthProfileOrder", () => { }); expect(order).toEqual(["anthropic:oauth", "anthropic:default"]); }); + + it("orders by lastUsed when no explicit order exists", () => { + const order = resolveAuthProfileOrder({ + store: { + version: 1, + profiles: { + "anthropic:a": { + type: "oauth", + provider: "anthropic", + access: "access-token", + refresh: "refresh-token", + expires: Date.now() + 60_000, + }, + "anthropic:b": { + type: "api_key", + provider: "anthropic", + key: "sk-b", + }, + "anthropic:c": { + type: "api_key", + provider: "anthropic", + key: "sk-c", + }, + }, + usageStats: { + "anthropic:a": { lastUsed: 200 }, + "anthropic:b": { lastUsed: 100 }, + "anthropic:c": { lastUsed: 300 }, + }, + }, + provider: "anthropic", + }); + expect(order).toEqual(["anthropic:b", "anthropic:a", "anthropic:c"]); + }); + + it("pushes cooldown profiles to the end, ordered by cooldown expiry", () => { + const now = Date.now(); + const order = resolveAuthProfileOrder({ + store: { + version: 1, + profiles: { + "anthropic:ready": { + type: "api_key", + provider: "anthropic", + key: "sk-ready", + }, + "anthropic:cool1": { + type: "oauth", + 
provider: "anthropic", + access: "access-token", + refresh: "refresh-token", + expires: now + 60_000, + }, + "anthropic:cool2": { + type: "api_key", + provider: "anthropic", + key: "sk-cool", + }, + }, + usageStats: { + "anthropic:ready": { lastUsed: 50 }, + "anthropic:cool1": { cooldownUntil: now + 5_000 }, + "anthropic:cool2": { cooldownUntil: now + 1_000 }, + }, + }, + provider: "anthropic", + }); + expect(order).toEqual([ + "anthropic:ready", + "anthropic:cool2", + "anthropic:cool1", + ]); + }); +}); + +describe("auth profile cooldowns", () => { + it("applies exponential backoff with a 1h cap", () => { + expect(calculateAuthProfileCooldownMs(1)).toBe(60_000); + expect(calculateAuthProfileCooldownMs(2)).toBe(5 * 60_000); + expect(calculateAuthProfileCooldownMs(3)).toBe(25 * 60_000); + expect(calculateAuthProfileCooldownMs(4)).toBe(60 * 60_000); + expect(calculateAuthProfileCooldownMs(5)).toBe(60 * 60_000); + }); }); diff --git a/src/agents/auth-profiles.ts b/src/agents/auth-profiles.ts index 8fa3080d5..43308674c 100644 --- a/src/agents/auth-profiles.ts +++ b/src/agents/auth-profiles.ts @@ -368,6 +368,14 @@ export function markAuthProfileUsed(params: { saveAuthProfileStore(store); } +export function calculateAuthProfileCooldownMs(errorCount: number): number { + const normalized = Math.max(1, errorCount); + return Math.min( + 60 * 60 * 1000, // 1 hour max + 60 * 1000 * 5 ** Math.min(normalized - 1, 3), + ); +} + /** * Mark a profile as failed/rate-limited. Applies exponential backoff cooldown. * Cooldown times: 1min, 5min, 25min, max 1 hour. @@ -384,10 +392,7 @@ export function markAuthProfileCooldown(params: { const errorCount = (existing.errorCount ?? 
0) + 1; // Exponential backoff: 1min, 5min, 25min, capped at 1h - const backoffMs = Math.min( - 60 * 60 * 1000, // 1 hour max - 60 * 1000 * Math.pow(5, Math.min(errorCount - 1, 3)), - ); + const backoffMs = calculateAuthProfileCooldownMs(errorCount); store.usageStats[profileId] = { ...existing, diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index eb2bb78f9..b50bf0083 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -24,7 +24,11 @@ import { } from "../process/command-queue.js"; import { resolveUserPath } from "../utils.js"; import { resolveClawdbotAgentDir } from "./agent-paths.js"; -import { markAuthProfileGood, markAuthProfileUsed, markAuthProfileCooldown } from "./auth-profiles.js"; +import { + markAuthProfileCooldown, + markAuthProfileGood, + markAuthProfileUsed, +} from "./auth-profiles.js"; import type { BashElevatedDefaults } from "./bash-tools.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js"; import { @@ -955,14 +959,18 @@ export async function runEmbeddedPiAgent(params: { (params.config?.agent?.model?.fallbacks?.length ?? 0) > 0; const authFailure = isAuthAssistantError(lastAssistant); const rateLimitFailure = isRateLimitAssistantError(lastAssistant); - + // Treat timeout as potential rate limit (Antigravity hangs on rate limit) - const shouldRotate = (!aborted && (authFailure || rateLimitFailure)) || timedOut; - + const shouldRotate = + (!aborted && (authFailure || rateLimitFailure)) || timedOut; + if (shouldRotate) { // Mark current profile for cooldown before rotating if (lastProfileId) { - markAuthProfileCooldown({ store: authStore, profileId: lastProfileId }); + markAuthProfileCooldown({ + store: authStore, + profileId: lastProfileId, + }); if (timedOut) { log.warn( `Profile ${lastProfileId} timed out (possible rate limit). 
Trying next account...`, @@ -973,15 +981,17 @@ export async function runEmbeddedPiAgent(params: { if (rotated) { continue; } - if (fallbackConfigured && !timedOut) { + if (fallbackConfigured) { const message = lastAssistant?.errorMessage?.trim() || (lastAssistant ? formatAssistantErrorText(lastAssistant) : "") || - (rateLimitFailure - ? "LLM request rate limited." - : "LLM request unauthorized."); + (timedOut + ? "LLM request timed out." + : rateLimitFailure + ? "LLM request rate limited." + : "LLM request unauthorized."); throw new Error(message); } } diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index dd45750c3..16b0b2176 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -394,4 +394,5 @@ export type { CronRunParams, CronRunsParams, CronRunLogEntry, + PollParams, };