fix: improve auth profile failover

2026-01-06 07:18:06 +01:00
parent a7b5753dc4
commit aa16b679ad
7 changed files with 191 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - Groups: `whatsapp.groups`, `telegram.groups`, and `imessage.groups` now act as allowlists when set. Add `"*"` to keep allow-all behavior.

 ### Fixes
+- Messages: stop defaulting ack reactions to 👀 when identity emoji is missing.
 - Auto-reply: require slash for control commands to avoid false triggers in normal text.
 - Auto-reply: treat steer during compaction as a follow-up, queued until compaction completes.
 - Auth: lock auth profile refreshes to avoid multi-instance OAuth logouts; keep credentials on refresh failure.
@@ -67,6 +68,7 @@
 - Discord: use channel IDs for DMs instead of user IDs. Thanks @VACInc for PR #261.
 - Docs: clarify Slack manifest scopes (current vs optional) with references. Thanks @jarvis-medmatic for PR #235.
 - Control UI: avoid Slack config ReferenceError by reading slack config snapshots. Thanks @sreekaransrinath for PR #249.
+- Auth: rotate across multiple OAuth profiles with cooldown tracking and email-based profile IDs. Thanks @mukhtharcm for PR #269.
 - Telegram: honor routing.groupChat.mentionPatterns for group mention gating. Thanks Kevin Kern (@regenrek) for PR #242.
 - Telegram: gate groups via `telegram.groups` allowlist (align with WhatsApp/iMessage). Thanks @kitze for PR #241.
 - Telegram: support media groups (multi-image messages). Thanks @obviyus for PR #220.
--- a/docs/model-failover.md
+++ b/docs/model-failover.md
@@ -0,0 +1,75 @@
+---
+summary: "How Clawdbot rotates auth profiles and falls back across models"
+read_when:
+  - Diagnosing auth profile rotation, cooldowns, or model fallback behavior
+  - Updating failover rules for auth profiles or models
+---
+
+# Model failover
+
+Clawdbot handles failures in two stages:
+1) **Auth profile rotation** within the current provider.
+2) **Model fallback** to the next model in `agent.model.fallbacks`.
+
+This doc explains the runtime rules and the data that backs them.
+
+## Profile IDs
+
+OAuth logins create distinct profiles so multiple accounts can coexist.
+- Default: `provider:default` when no email is available.
+- OAuth with email: `provider:<email>` (for example `google-antigravity:user@gmail.com`).
+
+Profiles live in `~/.clawdbot/agent/auth-profiles.json` under `profiles`.
+
+## Rotation order
+
+When a provider has multiple profiles, Clawdbot chooses an order like this:
+
+1) **Explicit config**: `auth.order[provider]` (if set).
+2) **Configured profiles**: `auth.profiles` filtered by provider.
+3) **Stored profiles**: entries in `auth-profiles.json` for the provider.
+
+If no explicit order is configured, Clawdbot uses a round‑robin order:
+- **Primary key:** `usageStats.lastUsed` (oldest first).
+- **Secondary key:** profile type (OAuth before API keys).
+- **Cooldown profiles** are moved to the end, ordered by soonest cooldown expiry.
+
+## Cooldowns
+
+When a profile fails due to auth/rate‑limit errors (or a timeout, which is
+treated as a possible rate limit), Clawdbot marks it in cooldown and moves to the next profile.
+
+Cooldowns use exponential backoff, stepping up with each recorded failure (`errorCount`):
+- 1 minute
+- 5 minutes
+- 25 minutes
+- 1 hour (cap)
+
+State is stored in `auth-profiles.json` under `usageStats`:
+
+```json
+{
+  "usageStats": {
+    "provider:profile": {
+      "lastUsed": 1736160000000,
+      "cooldownUntil": 1736160600000,
+      "errorCount": 2
+    }
+  }
+}
+```
+
+## Model fallback
+
+If all profiles for a provider fail, Clawdbot moves to the next model in
+`agent.model.fallbacks`. This applies to auth failures, rate limits, and
+timeouts that exhausted profile rotation.
+
+## Related config
+
+See `docs/configuration.md` for:
+- `auth.profiles` / `auth.order`
+- `agent.model.primary` / `agent.model.fallbacks`
+- `agent.imageModel` routing
+
+See `docs/models.md` for the broader model selection and fallback overview.
--- a/docs/models.md
+++ b/docs/models.md
@@ -77,6 +77,7 @@ Output
 - Image routing uses `agent.imageModel` **only when configured** and the primary
  model lacks image input.
 - Persist last successful provider/model to session entry; auth profile success is global.
+- See `docs/model-failover.md` for auth profile rotation, cooldowns, and timeout handling.

 ## Tests

--- a/src/agents/auth-profiles.test.ts
+++ b/src/agents/auth-profiles.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";

 import {
  type AuthProfileStore,
+  calculateAuthProfileCooldownMs,
  resolveAuthProfileOrder,
 } from "./auth-profiles.js";

@@ -105,4 +106,87 @@ describe("resolveAuthProfileOrder", () => {
    });
    expect(order).toEqual(["anthropic:oauth", "anthropic:default"]);
  });
+
+  it("orders by lastUsed when no explicit order exists", () => {
+    const order = resolveAuthProfileOrder({
+      store: {
+        version: 1,
+        profiles: {
+          "anthropic:a": {
+            type: "oauth",
+            provider: "anthropic",
+            access: "access-token",
+            refresh: "refresh-token",
+            expires: Date.now() + 60_000,
+          },
+          "anthropic:b": {
+            type: "api_key",
+            provider: "anthropic",
+            key: "sk-b",
+          },
+          "anthropic:c": {
+            type: "api_key",
+            provider: "anthropic",
+            key: "sk-c",
+          },
+        },
+        usageStats: {
+          "anthropic:a": { lastUsed: 200 },
+          "anthropic:b": { lastUsed: 100 }, // smallest lastUsed, so expected first
+          "anthropic:c": { lastUsed: 300 }, // largest lastUsed, so expected last
+        },
+      },
+      provider: "anthropic", // no explicit order is passed in; only store + provider
+    });
+    expect(order).toEqual(["anthropic:b", "anthropic:a", "anthropic:c"]); // ascending lastUsed; the OAuth profile "a" is not promoted ahead of "b"
+  });
+
+  it("pushes cooldown profiles to the end, ordered by cooldown expiry", () => {
+    const now = Date.now(); // shared reference time for the expiry/cooldown offsets below
+    const order = resolveAuthProfileOrder({
+      store: {
+        version: 1,
+        profiles: {
+          "anthropic:ready": {
+            type: "api_key",
+            provider: "anthropic",
+            key: "sk-ready",
+          },
+          "anthropic:cool1": {
+            type: "oauth",
+            provider: "anthropic",
+            access: "access-token",
+            refresh: "refresh-token",
+            expires: now + 60_000,
+          },
+          "anthropic:cool2": {
+            type: "api_key",
+            provider: "anthropic",
+            key: "sk-cool",
+          },
+        },
+        usageStats: {
+          "anthropic:ready": { lastUsed: 50 }, // no cooldownUntil: the only profile not cooling down
+          "anthropic:cool1": { cooldownUntil: now + 5_000 }, // later expiry, so expected last
+          "anthropic:cool2": { cooldownUntil: now + 1_000 }, // sooner expiry, so expected before cool1
+        },
+      },
+      provider: "anthropic",
+    });
+    expect(order).toEqual([
+      "anthropic:ready",
+      "anthropic:cool2",
+      "anthropic:cool1",
+    ]);
+  });
+});
+
+describe("auth profile cooldowns", () => {
+  it("applies exponential backoff with a 1h cap", () => {
+    expect(calculateAuthProfileCooldownMs(1)).toBe(60_000); // first failure: 1 minute
+    expect(calculateAuthProfileCooldownMs(2)).toBe(5 * 60_000); // 5 minutes
+    expect(calculateAuthProfileCooldownMs(3)).toBe(25 * 60_000); // 25 minutes
+    expect(calculateAuthProfileCooldownMs(4)).toBe(60 * 60_000); // next 5x step would be 125 minutes; expected value is the 1h cap
+    expect(calculateAuthProfileCooldownMs(5)).toBe(60 * 60_000); // stays at the cap
+  });
 });
--- a/src/agents/auth-profiles.ts
+++ b/src/agents/auth-profiles.ts
@@ -368,6 +368,14 @@ export function markAuthProfileUsed(params: {
  saveAuthProfileStore(store);
 }

+export function calculateAuthProfileCooldownMs(errorCount: number): number {
+  const normalized = Math.max(1, errorCount); // counts below 1 are treated as the first failure
+  return Math.min(
+    60 * 60 * 1000, // 1 hour max
+    60 * 1000 * 5 ** Math.min(normalized - 1, 3), // 1 minute * 5^(n-1), exponent clamped at 3 (125 min), so the 1h cap wins from n = 4
+  );
+}
+
 /**
 * Mark a profile as failed/rate-limited. Applies exponential backoff cooldown.
 * Cooldown times: 1min, 5min, 25min, max 1 hour.
@@ -384,10 +392,7 @@ export function markAuthProfileCooldown(params: {
  const errorCount = (existing.errorCount ?? 0) + 1;

  // Exponential backoff: 1min, 5min, 25min, capped at 1h
-  const backoffMs = Math.min(
-    60 * 60 * 1000, // 1 hour max
-    60 * 1000 * Math.pow(5, Math.min(errorCount - 1, 3)),
-  );
+  const backoffMs = calculateAuthProfileCooldownMs(errorCount);

  store.usageStats[profileId] = {
    ...existing,
--- a/src/agents/pi-embedded-runner.ts
+++ b/src/agents/pi-embedded-runner.ts
@@ -24,7 +24,11 @@ import {
 } from "../process/command-queue.js";
 import { resolveUserPath } from "../utils.js";
 import { resolveClawdbotAgentDir } from "./agent-paths.js";
-import { markAuthProfileGood, markAuthProfileUsed, markAuthProfileCooldown } from "./auth-profiles.js";
+import {
+  markAuthProfileCooldown,
+  markAuthProfileGood,
+  markAuthProfileUsed,
+} from "./auth-profiles.js";
 import type { BashElevatedDefaults } from "./bash-tools.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
@@ -955,14 +959,18 @@ export async function runEmbeddedPiAgent(params: {
            (params.config?.agent?.model?.fallbacks?.length ?? 0) > 0;
          const authFailure = isAuthAssistantError(lastAssistant);
          const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
-          
+
          // Treat timeout as potential rate limit (Antigravity hangs on rate limit)
-          const shouldRotate = (!aborted && (authFailure || rateLimitFailure)) || timedOut;
-          
+          const shouldRotate =
+            (!aborted && (authFailure || rateLimitFailure)) || timedOut;
+
          if (shouldRotate) {
            // Mark current profile for cooldown before rotating
            if (lastProfileId) {
-              markAuthProfileCooldown({ store: authStore, profileId: lastProfileId });
+              markAuthProfileCooldown({
+                store: authStore,
+                profileId: lastProfileId,
+              });
              if (timedOut) {
                log.warn(
                  `Profile ${lastProfileId} timed out (possible rate limit). Trying next account...`,
@@ -973,15 +981,17 @@ export async function runEmbeddedPiAgent(params: {
            if (rotated) {
              continue;
            }
-            if (fallbackConfigured && !timedOut) {
+            if (fallbackConfigured) {
              const message =
                lastAssistant?.errorMessage?.trim() ||
                (lastAssistant
                  ? formatAssistantErrorText(lastAssistant)
                  : "") ||
-                (rateLimitFailure
-                  ? "LLM request rate limited."
-                  : "LLM request unauthorized.");
+                (timedOut
+                  ? "LLM request timed out."
+                  : rateLimitFailure
+                    ? "LLM request rate limited."
+                    : "LLM request unauthorized.");
              throw new Error(message);
            }
          }
--- a/src/gateway/protocol/index.ts
+++ b/src/gateway/protocol/index.ts
@@ -394,4 +394,5 @@ export type {
  CronRunParams,
  CronRunsParams,
  CronRunLogEntry,
+  PollParams,
 };