refactor(agents): centralize failover handling

2026-01-09 21:31:13 +01:00
parent cfeaa34c16
commit 374aa856f2
7 changed files with 292 additions and 61 deletions
--- a/src/agents/auth-profiles.test.ts
+++ b/src/agents/auth-profiles.test.ts
@@ -151,6 +151,49 @@ describe("resolveAuthProfileOrder", () => {
    expect(order).toEqual(["anthropic:work", "anthropic:default"]);
  });

+  it("pushes disabled profiles to the end even with store order", () => {
+    // Disable the profile that the stored order lists first.
+    const disabledUntil = Date.now() + 60_000;
+    const resolved = resolveAuthProfileOrder({
+      provider: "anthropic",
+      store: {
+        ...store,
+        order: { anthropic: ["anthropic:default", "anthropic:work"] },
+        usageStats: {
+          "anthropic:default": { disabledUntil, disabledReason: "billing" },
+          "anthropic:work": { lastUsed: 1 },
+        },
+      },
+    });
+    // The usable profile must be listed ahead of the disabled one.
+    expect(resolved).toEqual(["anthropic:work", "anthropic:default"]);
+  });
+
+  it("pushes disabled profiles to the end even with configured order", () => {
+    // Same scenario as the store-order case, but the preferred order comes
+    // from cfg.auth.order instead of the store.
+    const disabledUntil = Date.now() + 60_000;
+    const resolved = resolveAuthProfileOrder({
+      provider: "anthropic",
+      cfg: {
+        auth: {
+          order: { anthropic: ["anthropic:default", "anthropic:work"] },
+          profiles: cfg.auth.profiles,
+        },
+      },
+      store: {
+        ...store,
+        usageStats: {
+          "anthropic:default": { disabledUntil, disabledReason: "billing" },
+          "anthropic:work": { lastUsed: 1 },
+        },
+      },
+    });
+    // The usable profile must be listed ahead of the disabled one.
+    expect(resolved).toEqual(["anthropic:work", "anthropic:default"]);
+  });
+
  it("normalizes z.ai aliases in auth.order", () => {
    const order = resolveAuthProfileOrder({
      cfg: {
--- a/src/agents/auth-profiles.ts
+++ b/src/agents/auth-profiles.ts
@@ -72,11 +72,21 @@ export type AuthProfileCredential =
  | TokenCredential
  | OAuthCredential;

+/**
+ * Why an auth profile was marked as failed.
+ *
+ * markAuthProfileFailure handles "billing" by setting a disable window
+ * (`disabledUntil` / `disabledReason`); every other reason sets the regular
+ * `cooldownUntil` backoff instead.
+ */
+export type AuthProfileFailureReason =
+  | "auth"
+  | "rate_limit"
+  | "billing"
+  | "timeout"
+  | "unknown";
+
 /** Per-profile usage statistics for round-robin and cooldown tracking */
 export type ProfileUsageStats = {
  lastUsed?: number;
  cooldownUntil?: number;
+  disabledUntil?: number;
+  disabledReason?: AuthProfileFailureReason;
  errorCount?: number;
+  failureCounts?: Partial<Record<AuthProfileFailureReason, number>>;
 };

 export type AuthProfileStore = {
@@ -772,8 +782,9 @@ export function isProfileInCooldown(
  profileId: string,
 ): boolean {
  const stats = store.usageStats?.[profileId];
-  if (!stats?.cooldownUntil) return false;
-  return Date.now() < stats.cooldownUntil;
+  if (!stats) return false;
+  const unusableUntil = resolveProfileUnusableUntil(stats);
+  return unusableUntil ? Date.now() < unusableUntil : false;
 }

 /**
@@ -796,6 +807,9 @@ export async function markAuthProfileUsed(params: {
        lastUsed: Date.now(),
        errorCount: 0,
        cooldownUntil: undefined,
+        disabledUntil: undefined,
+        disabledReason: undefined,
+        failureCounts: undefined,
      };
      return true;
    },
@@ -812,6 +826,9 @@ export async function markAuthProfileUsed(params: {
    lastUsed: Date.now(),
    errorCount: 0,
    cooldownUntil: undefined,
+    disabledUntil: undefined,
+    disabledReason: undefined,
+    failureCounts: undefined,
  };
  saveAuthProfileStore(store, agentDir);
 }
@@ -824,34 +841,74 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
  );
 }

+/**
+ * Resolve how long a profile stays disabled after a billing failure.
+ *
+ * The window escalates with the number of billing failures recorded so far:
+ * 30 minutes, 2 hours, 8 hours, then capped at 24 hours.
+ *
+ * @param errorCount - Billing failure count. Values below 1 are treated as 1,
+ *   fractional values are floored, and NaN is treated as 1.
+ * @returns Disable duration in milliseconds (always a finite number).
+ */
+function calculateAuthProfileBillingDisableMs(errorCount: number): number {
+  const minuteMs = 60 * 1000;
+  const hourMs = 60 * minuteMs;
+  const longestMs = 24 * hourMs;
+  const steps = [30 * minuteMs, 2 * hourMs, 8 * hourMs, longestMs];
+  // The count is derived from stored usage stats. A fractional or NaN value
+  // would index `steps` with a non-integer and yield `undefined`, which in
+  // turn makes `now + backoffMs` NaN, so normalize to a whole count >= 1.
+  const normalized = Number.isNaN(errorCount)
+    ? 1
+    : Math.max(1, Math.floor(errorCount));
+  return steps[Math.min(normalized - 1, steps.length - 1)] ?? longestMs;
+}
+
+/**
+ * Return the later of `cooldownUntil` and `disabledUntil` for a profile.
+ *
+ * Only finite, strictly positive numbers are considered; when neither field
+ * qualifies the result is null.
+ */
+function resolveProfileUnusableUntil(stats: ProfileUsageStats): number | null {
+  let latest: number | null = null;
+  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
+    if (typeof candidate !== "number") continue;
+    if (!Number.isFinite(candidate) || candidate <= 0) continue;
+    if (latest === null || candidate > latest) latest = candidate;
+  }
+  return latest;
+}
+
+/**
+ * Exported lookup of a profile's unusable-until timestamp by profile id.
+ *
+ * @returns The value from resolveProfileUnusableUntil, or null when the
+ *   store holds no usage stats for `profileId`.
+ */
+export function resolveProfileUnusableUntilForDisplay(
+  store: AuthProfileStore,
+  profileId: string,
+): number | null {
+  const profileStats = store.usageStats?.[profileId];
+  return profileStats ? resolveProfileUnusableUntil(profileStats) : null;
+}
+
 /**
- * Mark a profile as failed/rate-limited. Applies exponential backoff cooldown.
- * Cooldown times: 1min, 5min, 25min, max 1 hour.
- * Uses store lock to avoid overwriting concurrent usage updates.
+ * Mark a profile as failed for a specific reason. Billing failures are treated
+ * as "disabled" (longer backoff) vs the regular cooldown window.
 */
-export async function markAuthProfileCooldown(params: {
+export async function markAuthProfileFailure(params: {
  store: AuthProfileStore;
  profileId: string;
+  reason: AuthProfileFailureReason;
  agentDir?: string;
 }): Promise<void> {
-  const { store, profileId, agentDir } = params;
+  const { store, profileId, reason, agentDir } = params;
  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!freshStore.profiles[profileId]) return false;
-
      freshStore.usageStats = freshStore.usageStats ?? {};
      const existing = freshStore.usageStats[profileId] ?? {};
-      const errorCount = (existing.errorCount ?? 0) + 1;

-      // Exponential backoff: 1min, 5min, 25min, capped at 1h
-      const backoffMs = calculateAuthProfileCooldownMs(errorCount);
+      const nextErrorCount = (existing.errorCount ?? 0) + 1;
+      const failureCounts = { ...existing.failureCounts };
+      failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

-      freshStore.usageStats[profileId] = {
+      const now = Date.now();
+      const updatedStats: ProfileUsageStats = {
        ...existing,
-        errorCount,
-        cooldownUntil: Date.now() + backoffMs,
+        errorCount: nextErrorCount,
+        failureCounts,
      };
+
+      if (reason === "billing") {
+        const billingCount = failureCounts.billing ?? 1;
+        const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
+        updatedStats.disabledUntil = now + backoffMs;
+        updatedStats.disabledReason = "billing";
+      } else {
+        const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
+        updatedStats.cooldownUntil = now + backoffMs;
+      }
+
+      freshStore.usageStats[profileId] = updatedStats;
      return true;
    },
  });
@@ -863,19 +920,48 @@ export async function markAuthProfileCooldown(params: {

  store.usageStats = store.usageStats ?? {};
  const existing = store.usageStats[profileId] ?? {};
-  const errorCount = (existing.errorCount ?? 0) + 1;
+  const nextErrorCount = (existing.errorCount ?? 0) + 1;
+  const failureCounts = { ...existing.failureCounts };
+  failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

-  // Exponential backoff: 1min, 5min, 25min, capped at 1h
-  const backoffMs = calculateAuthProfileCooldownMs(errorCount);
-
-  store.usageStats[profileId] = {
+  const now = Date.now();
+  const updatedStats: ProfileUsageStats = {
    ...existing,
-    errorCount,
-    cooldownUntil: Date.now() + backoffMs,
+    errorCount: nextErrorCount,
+    failureCounts,
  };
+  if (reason === "billing") {
+    const billingCount = failureCounts.billing ?? 1;
+    const backoffMs = calculateAuthProfileBillingDisableMs(billingCount);
+    updatedStats.disabledUntil = now + backoffMs;
+    updatedStats.disabledReason = "billing";
+  } else {
+    const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
+    updatedStats.cooldownUntil = now + backoffMs;
+  }
+
+  store.usageStats[profileId] = updatedStats;
  saveAuthProfileStore(store, agentDir);
 }

+/**
+ * Mark a profile as failed/rate-limited without a specific reason.
+ * Thin wrapper over markAuthProfileFailure with reason "unknown", which takes
+ * the non-billing path and applies the exponential backoff cooldown
+ * (cooldown times: 1min, 5min, 25min, max 1 hour).
+ */
+export async function markAuthProfileCooldown(params: {
+  store: AuthProfileStore;
+  profileId: string;
+  agentDir?: string;
+}): Promise<void> {
+  const { store, profileId, agentDir } = params;
+  await markAuthProfileFailure({
+    store,
+    profileId,
+    reason: "unknown",
+    agentDir,
+  });
+}
+
 /**
 * Clear cooldown for a profile (e.g., manual reset).
 * Uses store lock to avoid overwriting concurrent usage updates.
@@ -973,7 +1059,8 @@ export function resolveAuthProfileOrder(params: {
    const inCooldown: Array<{ profileId: string; cooldownUntil: number }> = [];

    for (const profileId of deduped) {
-      const cooldownUntil = store.usageStats?.[profileId]?.cooldownUntil;
+      const cooldownUntil =
+        resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? 0;
      if (
        typeof cooldownUntil === "number" &&
        Number.isFinite(cooldownUntil) &&
@@ -1057,7 +1144,8 @@ function orderProfilesByMode(
  const cooldownSorted = inCooldown
    .map((profileId) => ({
      profileId,
-      cooldownUntil: store.usageStats?.[profileId]?.cooldownUntil ?? now,
+      cooldownUntil:
+        resolveProfileUnusableUntil(store.usageStats?.[profileId] ?? {}) ?? now,
    }))
    .sort((a, b) => a.cooldownUntil - b.cooldownUntil)
    .map((entry) => entry.profileId);
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -56,6 +56,28 @@ describe("runWithModelFallback", () => {
    expect(run.mock.calls[1]?.[1]).toBe("claude-haiku-3-5");
  });

+  it("falls back on 402 payment required", async () => {
+    const cfg = makeCfg();
+    // First attempt rejects with an error carrying HTTP status 402.
+    const paymentRequired = Object.assign(new Error("payment required"), {
+      status: 402,
+    });
+    const run = vi
+      .fn()
+      .mockRejectedValueOnce(paymentRequired)
+      .mockResolvedValueOnce("ok");
+
+    const outcome = await runWithModelFallback({
+      cfg,
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      run,
+    });
+
+    expect(outcome.result).toBe("ok");
+    expect(run).toHaveBeenCalledTimes(2);
+    // The second call is expected to target the fallback provider/model.
+    const secondCall = run.mock.calls[1];
+    expect(secondCall?.[0]).toBe("anthropic");
+    expect(secondCall?.[1]).toBe("claude-haiku-3-5");
+  });
+
  it("falls back on billing errors", async () => {
    const cfg = makeCfg();
    const run = vi
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -7,11 +7,7 @@ import {
  resolveConfiguredModelRef,
  resolveModelRefFromString,
 } from "./model-selection.js";
-import {
-  isAuthErrorMessage,
-  isBillingErrorMessage,
-  isRateLimitErrorMessage,
-} from "./pi-embedded-helpers.js";
+import { isFailoverErrorMessage } from "./pi-embedded-helpers.js";

 type ModelCandidate = {
  provider: string;
@@ -71,16 +67,6 @@ function getErrorMessage(err: unknown): string {
  return "";
 }

-function isTimeoutErrorMessage(raw: string): boolean {
-  const value = raw.toLowerCase();
-  return (
-    value.includes("timeout") ||
-    value.includes("timed out") ||
-    value.includes("deadline exceeded") ||
-    value.includes("context deadline exceeded")
-  );
-}
-
 function shouldFallbackForError(err: unknown): boolean {
  const statusCode = getStatusCode(err);
  if (statusCode && [401, 402, 403, 429].includes(statusCode)) return true;
@@ -94,12 +80,7 @@ function shouldFallbackForError(err: unknown): boolean {
  }
  const message = getErrorMessage(err);
  if (!message) return false;
-  return (
-    isAuthErrorMessage(message) ||
-    isRateLimitErrorMessage(message) ||
-    isBillingErrorMessage(message) ||
-    isTimeoutErrorMessage(message)
-  );
+  return isFailoverErrorMessage(message);
 }

 function buildAllowedModelKeys(
--- a/src/agents/pi-embedded-helpers.test.ts
+++ b/src/agents/pi-embedded-helpers.test.ts
@@ -3,9 +3,11 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
 import { describe, expect, it } from "vitest";
 import {
  buildBootstrapContextFiles,
+  classifyFailoverReason,
  formatAssistantErrorText,
  isBillingErrorMessage,
  isContextOverflowError,
+  isFailoverErrorMessage,
  isMessagingToolDuplicate,
  normalizeTextForComparison,
  sanitizeGoogleTurnOrdering,
@@ -238,6 +240,30 @@ describe("isBillingErrorMessage", () => {
  });
 });

+describe("isFailoverErrorMessage", () => {
+  it("matches auth/rate/billing/timeout", () => {
+    // One representative message per category named in the test title.
+    const failoverMessages = [
+      "invalid api key",
+      "429 rate limit exceeded",
+      "Your credit balance is too low",
+      "request timed out",
+    ];
+    for (const message of failoverMessages) {
+      const matched = isFailoverErrorMessage(message);
+      expect(matched).toBe(true);
+    }
+  });
+});
+
+describe("classifyFailoverReason", () => {
+  it("returns a stable reason", () => {
+    // Message -> expected reason, checked in the listed order.
+    const expectations = [
+      ["invalid api key", "auth"],
+      ["429 too many requests", "rate_limit"],
+      ["credit balance too low", "billing"],
+      ["deadline exceeded", "timeout"],
+    ] as const;
+    for (const [message, reason] of expectations) {
+      expect(classifyFailoverReason(message)).toBe(reason);
+    }
+    // A message outside every failover category classifies as null.
+    expect(classifyFailoverReason("bad request")).toBeNull();
+  });
+});
+
 describe("formatAssistantErrorText", () => {
  const makeAssistantError = (errorMessage: string): AssistantMessage =>
    ({
--- a/src/agents/pi-embedded-helpers.ts
+++ b/src/agents/pi-embedded-helpers.ts
@@ -261,6 +261,17 @@ export function isRateLimitErrorMessage(raw: string): boolean {
  );
 }

+/**
+ * Report whether an error message looks like a timeout.
+ *
+ * Matching is a case-insensitive substring search; an empty message never
+ * matches.
+ */
+export function isTimeoutErrorMessage(raw: string): boolean {
+  const lowered = raw.toLowerCase();
+  if (lowered.length === 0) return false;
+  const markers = [
+    "timeout",
+    "timed out",
+    "deadline exceeded",
+    "context deadline exceeded",
+  ];
+  return markers.some((marker) => lowered.includes(marker));
+}
+
 export function isBillingErrorMessage(raw: string): boolean {
  const value = raw.toLowerCase();
  if (!value) return false;
@@ -308,6 +319,32 @@ export function isAuthAssistantError(
  return isAuthErrorMessage(msg.errorMessage ?? "");
 }

+/**
+ * Category assigned to an error message that should trigger failover.
+ *
+ * classifyFailoverReason only ever produces "auth", "rate_limit", "billing"
+ * or "timeout" (or null when nothing matches); "unknown" is part of the type
+ * but is not returned by it.
+ */
+export type FailoverReason =
+  | "auth"
+  | "rate_limit"
+  | "billing"
+  | "timeout"
+  | "unknown";
+
+/**
+ * Map an error message to the failover category it belongs to.
+ *
+ * @returns The first matching category, or null when the message matches
+ *   none of the auth / rate-limit / billing / timeout detectors.
+ */
+export function classifyFailoverReason(raw: string): FailoverReason | null {
+  // Checked in order; the first detector that matches decides the reason.
+  const detectors: Array<[FailoverReason, (message: string) => boolean]> = [
+    ["auth", isAuthErrorMessage],
+    ["rate_limit", isRateLimitErrorMessage],
+    ["billing", isBillingErrorMessage],
+    ["timeout", isTimeoutErrorMessage],
+  ];
+  for (const [reason, matches] of detectors) {
+    if (matches(raw)) return reason;
+  }
+  return null;
+}
+
+/** True when classifyFailoverReason assigns the message any category. */
+export function isFailoverErrorMessage(raw: string): boolean {
+  const reason = classifyFailoverReason(raw);
+  return reason !== null;
+}
+
+/**
+ * True when the assistant message stopped with an error whose text is a
+ * failover-worthy message. A missing message, or one whose stopReason is not
+ * "error", yields false; a missing errorMessage is treated as "".
+ */
+export function isFailoverAssistantError(
+  msg: AssistantMessage | undefined,
+): boolean {
+  const stoppedWithError = msg?.stopReason === "error";
+  return stoppedWithError
+    ? isFailoverErrorMessage(msg?.errorMessage ?? "")
+    : false;
+}
+
 function extractSupportedValues(raw: string): string[] {
  const match =
    raw.match(/supported values are:\s*([^\n.]+)/i) ??
--- a/src/agents/pi-embedded-runner.ts
+++ b/src/agents/pi-embedded-runner.ts
@@ -37,7 +37,7 @@ import { normalizeMessageProvider } from "../utils/message-provider.js";
 import { resolveUserPath } from "../utils.js";
 import { resolveClawdbotAgentDir } from "./agent-paths.js";
 import {
-  markAuthProfileCooldown,
+  markAuthProfileFailure,
  markAuthProfileGood,
  markAuthProfileUsed,
 } from "./auth-profiles.js";
@@ -55,17 +55,17 @@ import {
 import { ensureClawdbotModelsJson } from "./models-config.js";
 import {
  buildBootstrapContextFiles,
+  classifyFailoverReason,
  type EmbeddedContextFile,
  ensureSessionHeader,
  formatAssistantErrorText,
  isAuthAssistantError,
-  isAuthErrorMessage,
-  isBillingAssistantError,
-  isBillingErrorMessage,
  isContextOverflowError,
+  isFailoverAssistantError,
+  isFailoverErrorMessage,
  isGoogleModelApi,
  isRateLimitAssistantError,
-  isRateLimitErrorMessage,
+  isTimeoutErrorMessage,
  pickFallbackThinkingLevel,
  sanitizeGoogleTurnOrdering,
  sanitizeSessionMessagesImages,
@@ -1438,10 +1438,22 @@ export async function runEmbeddedPiAgent(params: {
                },
              };
            }
+            const promptFailoverReason = classifyFailoverReason(errorText);
            if (
-              (isAuthErrorMessage(errorText) ||
-                isRateLimitErrorMessage(errorText) ||
-                isBillingErrorMessage(errorText)) &&
+              promptFailoverReason &&
+              promptFailoverReason !== "timeout" &&
+              lastProfileId
+            ) {
+              await markAuthProfileFailure({
+                store: authStore,
+                profileId: lastProfileId,
+                reason: promptFailoverReason,
+                agentDir: params.agentDir,
+              });
+            }
+            if (
+              isFailoverErrorMessage(errorText) &&
+              promptFailoverReason !== "timeout" &&
              (await advanceAuthProfile())
            ) {
              continue;
@@ -1484,19 +1496,26 @@ export async function runEmbeddedPiAgent(params: {
            0;
          const authFailure = isAuthAssistantError(lastAssistant);
          const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
-          const billingFailure = isBillingAssistantError(lastAssistant);
+          const failoverFailure = isFailoverAssistantError(lastAssistant);
+          const assistantFailoverReason = classifyFailoverReason(
+            lastAssistant?.errorMessage ?? "",
+          );

          // Treat timeout as potential rate limit (Antigravity hangs on rate limit)
-          const shouldRotate =
-            (!aborted && (authFailure || rateLimitFailure || billingFailure)) ||
-            timedOut;
+          const shouldRotate = (!aborted && failoverFailure) || timedOut;

          if (shouldRotate) {
            // Mark current profile for cooldown before rotating
            if (lastProfileId) {
-              await markAuthProfileCooldown({
+              const reason =
+                timedOut || assistantFailoverReason === "timeout"
+                  ? "timeout"
+                  : (assistantFailoverReason ?? "unknown");
+              await markAuthProfileFailure({
                store: authStore,
                profileId: lastProfileId,
+                reason,
+                agentDir: params.agentDir,
              });
              if (timedOut) {
                log.warn(
@@ -1518,10 +1537,25 @@ export async function runEmbeddedPiAgent(params: {
                  ? "LLM request timed out."
                  : rateLimitFailure
                    ? "LLM request rate limited."
-                    : billingFailure
-                      ? "LLM request payment required."
-                      : "LLM request unauthorized.");
-              throw new Error(message);
+                    : authFailure
+                      ? "LLM request unauthorized."
+                      : "LLM request failed.");
+              const err = new Error(message);
+              (err as { failoverReason?: string }).failoverReason =
+                assistantFailoverReason ?? undefined;
+              if (assistantFailoverReason === "billing") {
+                (err as { status?: number }).status = 402;
+              } else if (assistantFailoverReason === "rate_limit") {
+                (err as { status?: number }).status = 429;
+              } else if (assistantFailoverReason === "auth") {
+                (err as { status?: number }).status = 401;
+              } else if (
+                assistantFailoverReason === "timeout" ||
+                isTimeoutErrorMessage(message)
+              ) {
+                (err as { status?: number }).status = 408;
+              }
+              throw err;
            }
          }