From eb5f758f6b3beee6f76f79048678e3166f6652d4 Mon Sep 17 00:00:00 2001
From: Muhammed Mukhthar CM
Date: Tue, 6 Jan 2026 22:44:19 +0000
Subject: [PATCH] fix(auth): improve multi-account round-robin rotation and 429 handling

This commit fixes several issues with multi-account OAuth rotation that
were causing slow responses and inefficient account cycling.

## Changes

### 1. Fix usageStats race condition (auth-profiles.ts)

The `markAuthProfileUsed`, `markAuthProfileCooldown`, `markAuthProfileGood`,
and `clearAuthProfileCooldown` functions were using a stale in-memory store
passed as a parameter. Long-running sessions would overwrite usageStats
updates from concurrent sessions when saving.

**Fix:** Re-read the store from disk before each update to get fresh
usageStats from other sessions, then merge the update.

### 2. Capture AbortError from waitForCompactionRetry (pi-embedded-runner.ts)

When a request timed out, `session.abort()` was called, which throws an
`AbortError`. The code structure was:

```javascript
try {
  await session.prompt(params.prompt);
} catch (err) {
  promptError = err; // Catches AbortError here
}
await waitForCompactionRetry(); // But THIS also throws AbortError!
```

The second `AbortError` from `waitForCompactionRetry()` escaped and
bypassed the rotation/fallback logic entirely.

**Fix:** Wrap `waitForCompactionRetry()` in its own try/catch to capture
the error as `promptError`, enabling proper timeout handling.

Root cause analysis and fix proposed by @erikpr1994 in #313.

Fixes #313

### 3. Fail fast on 429 rate limits (pi-ai patch)

The pi-ai library was retrying 429 errors up to 3 times with exponential
backoff before throwing. This meant a rate-limited account would waste
30+ seconds retrying before our rotation code could try the next account.

**Fix:** Patch google-gemini-cli.js to: - Throw immediately on first 429 (no retries) - Not catch and retry 429 errors in the network error handler This allows the caller to rotate to the next account instantly on rate limit. Note: We submitted this fix upstream (https://github.com/badlogic/pi-mono/pull/504) but it was closed without merging. Keeping as a local patch for now. ## Testing With 6 Antigravity accounts configured: - Accounts rotate properly based on lastUsed (round-robin) - 429s trigger immediate rotation to next account - usageStats persist correctly across concurrent sessions - Cooldown tracking works as expected ## Before/After **Before:** Multiple 429 retries on same account, 30-90s delays **After:** Instant rotation on 429, responses in seconds --- patches/@mariozechner__pi-ai.patch | 28 ++++++++++++++++ pnpm-lock.yaml | 10 +++--- src/agents/auth-profiles.ts | 53 ++++++++++++++++++------------ src/agents/pi-embedded-runner.ts | 7 +++- 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/patches/@mariozechner__pi-ai.patch b/patches/@mariozechner__pi-ai.patch index aa03fc55a..91a31a42b 100644 --- a/patches/@mariozechner__pi-ai.patch +++ b/patches/@mariozechner__pi-ai.patch @@ -213,3 +213,31 @@ index 20fb0a22aaa28f7ff7c2f44a8b628fa1d9d7d936..31bae0aface1319487ce62d35f1f3b6e })); } function mapStopReason(status) { +diff --git a/dist/providers/google-gemini-cli.js b/dist/providers/google-gemini-cli.js +--- a/dist/providers/google-gemini-cli.js ++++ b/dist/providers/google-gemini-cli.js +@@ -168,7 +168,12 @@ async function* streamCompletion(params, options) { + break; // Success, exit retry loop + } + const errorText = await response.text(); +- // Check if retryable ++ // PATCH: Fail immediately on 429 to let caller rotate accounts ++ if (response.status === 429) { ++ console.log(`[pi-ai] 429 rate limit - failing fast to rotate account`); ++ throw new Error(`Cloud Code Assist API error (${response.status}): ${errorText}`); ++ } ++ // Check if 
retryable (non-429 errors) + if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) { + // Use server-provided delay or exponential backoff + const serverDelay = extractRetryDelay(errorText); +@@ -183,6 +188,10 @@ async function* streamCompletion(params, options) { + if (error instanceof Error && error.message === "Request was aborted") { + throw error; + } ++ // PATCH: Don't retry 429 errors - let caller rotate accounts ++ if (error instanceof Error && error.message.includes("429")) { ++ throw error; ++ } + lastError = error instanceof Error ? error : new Error(String(error)); + // Network errors are retryable + if (attempt < MAX_RETRIES) { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f8e1e95a4..b4ea19aec 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,7 +9,7 @@ overrides: patchedDependencies: '@mariozechner/pi-ai': - hash: b49275c3e2023970d8248ababef6df60e093e58a3ba3127c2ba4de1df387d06a + hash: 31ccee9eb1fc5053d766ccbf3abc8340a5fb3bea1c52b491a385b53468b81819 path: patches/@mariozechner__pi-ai.patch playwright-core@1.57.0: hash: 66f1f266424dbe354068aaa5bba87bfb0e1d7d834a938c25dd70d43cdf1c1b02 @@ -39,7 +39,7 @@ importers: version: 0.37.2(ws@8.19.0)(zod@4.3.5) '@mariozechner/pi-ai': specifier: ^0.37.2 - version: 0.37.2(patch_hash=b49275c3e2023970d8248ababef6df60e093e58a3ba3127c2ba4de1df387d06a)(ws@8.19.0)(zod@4.3.5) + version: 0.37.2(patch_hash=31ccee9eb1fc5053d766ccbf3abc8340a5fb3bea1c52b491a385b53468b81819)(ws@8.19.0)(zod@4.3.5) '@mariozechner/pi-coding-agent': specifier: ^0.37.2 version: 0.37.2(ws@8.19.0)(zod@4.3.5) @@ -3570,7 +3570,7 @@ snapshots: '@mariozechner/pi-agent-core@0.37.2(ws@8.19.0)(zod@4.3.5)': dependencies: - '@mariozechner/pi-ai': 0.37.2(patch_hash=b49275c3e2023970d8248ababef6df60e093e58a3ba3127c2ba4de1df387d06a)(ws@8.19.0)(zod@4.3.5) + '@mariozechner/pi-ai': 0.37.2(patch_hash=31ccee9eb1fc5053d766ccbf3abc8340a5fb3bea1c52b491a385b53468b81819)(ws@8.19.0)(zod@4.3.5) '@mariozechner/pi-tui': 0.37.2 
transitivePeerDependencies: - '@modelcontextprotocol/sdk' @@ -3580,7 +3580,7 @@ snapshots: - ws - zod - '@mariozechner/pi-ai@0.37.2(patch_hash=b49275c3e2023970d8248ababef6df60e093e58a3ba3127c2ba4de1df387d06a)(ws@8.19.0)(zod@4.3.5)': + '@mariozechner/pi-ai@0.37.2(patch_hash=31ccee9eb1fc5053d766ccbf3abc8340a5fb3bea1c52b491a385b53468b81819)(ws@8.19.0)(zod@4.3.5)': dependencies: '@anthropic-ai/sdk': 0.71.2(zod@4.3.5) '@google/genai': 1.34.0 @@ -3604,7 +3604,7 @@ snapshots: dependencies: '@crosscopy/clipboard': 0.2.8 '@mariozechner/pi-agent-core': 0.37.2(ws@8.19.0)(zod@4.3.5) - '@mariozechner/pi-ai': 0.37.2(patch_hash=b49275c3e2023970d8248ababef6df60e093e58a3ba3127c2ba4de1df387d06a)(ws@8.19.0)(zod@4.3.5) + '@mariozechner/pi-ai': 0.37.2(patch_hash=31ccee9eb1fc5053d766ccbf3abc8340a5fb3bea1c52b491a385b53468b81819)(ws@8.19.0)(zod@4.3.5) '@mariozechner/pi-tui': 0.37.2 chalk: 5.6.2 cli-highlight: 2.1.11 diff --git a/src/agents/auth-profiles.ts b/src/agents/auth-profiles.ts index 8b0b31866..07b8f44ba 100644 --- a/src/agents/auth-profiles.ts +++ b/src/agents/auth-profiles.ts @@ -355,23 +355,26 @@ export function isProfileInCooldown( /** * Mark a profile as successfully used. Resets error count and updates lastUsed. + * Re-reads the store from disk to avoid overwriting concurrent updates. */ export function markAuthProfileUsed(params: { store: AuthProfileStore; profileId: string; agentDir?: string; }): void { - const { store, profileId, agentDir } = params; - if (!store.profiles[profileId]) return; + const { profileId, agentDir } = params; + // Re-read from disk to get fresh usageStats from other sessions + const freshStore = ensureAuthProfileStore(agentDir); + if (!freshStore.profiles[profileId]) return; - store.usageStats = store.usageStats ?? {}; - store.usageStats[profileId] = { - ...store.usageStats[profileId], + freshStore.usageStats = freshStore.usageStats ?? 
{}; + freshStore.usageStats[profileId] = { + ...freshStore.usageStats[profileId], lastUsed: Date.now(), errorCount: 0, cooldownUntil: undefined, }; - saveAuthProfileStore(store, agentDir); + saveAuthProfileStore(freshStore, agentDir); } export function calculateAuthProfileCooldownMs(errorCount: number): number { @@ -385,47 +388,53 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number { /** * Mark a profile as failed/rate-limited. Applies exponential backoff cooldown. * Cooldown times: 1min, 5min, 25min, max 1 hour. + * Re-reads the store from disk to avoid overwriting concurrent updates. */ export function markAuthProfileCooldown(params: { store: AuthProfileStore; profileId: string; agentDir?: string; }): void { - const { store, profileId, agentDir } = params; - if (!store.profiles[profileId]) return; + const { profileId, agentDir } = params; + // Re-read from disk to get fresh usageStats from other sessions + const freshStore = ensureAuthProfileStore(agentDir); + if (!freshStore.profiles[profileId]) return; - store.usageStats = store.usageStats ?? {}; - const existing = store.usageStats[profileId] ?? {}; + freshStore.usageStats = freshStore.usageStats ?? {}; + const existing = freshStore.usageStats[profileId] ?? {}; const errorCount = (existing.errorCount ?? 0) + 1; // Exponential backoff: 1min, 5min, 25min, capped at 1h const backoffMs = calculateAuthProfileCooldownMs(errorCount); - store.usageStats[profileId] = { + freshStore.usageStats[profileId] = { ...existing, errorCount, cooldownUntil: Date.now() + backoffMs, }; - saveAuthProfileStore(store, agentDir); + saveAuthProfileStore(freshStore, agentDir); } /** * Clear cooldown for a profile (e.g., manual reset). + * Re-reads the store from disk to avoid overwriting concurrent updates. 
*/ export function clearAuthProfileCooldown(params: { store: AuthProfileStore; profileId: string; agentDir?: string; }): void { - const { store, profileId, agentDir } = params; - if (!store.usageStats?.[profileId]) return; + const { profileId, agentDir } = params; + // Re-read from disk to get fresh usageStats from other sessions + const freshStore = ensureAuthProfileStore(agentDir); + if (!freshStore.usageStats?.[profileId]) return; - store.usageStats[profileId] = { - ...store.usageStats[profileId], + freshStore.usageStats[profileId] = { + ...freshStore.usageStats[profileId], errorCount: 0, cooldownUntil: undefined, }; - saveAuthProfileStore(store, agentDir); + saveAuthProfileStore(freshStore, agentDir); } export function resolveAuthProfileOrder(params: { @@ -591,11 +600,13 @@ export function markAuthProfileGood(params: { profileId: string; agentDir?: string; }): void { - const { store, provider, profileId, agentDir } = params; - const profile = store.profiles[profileId]; + const { provider, profileId, agentDir } = params; + // Re-read from disk to avoid overwriting concurrent updates + const freshStore = ensureAuthProfileStore(agentDir); + const profile = freshStore.profiles[profileId]; if (!profile || profile.provider !== provider) return; - store.lastGood = { ...store.lastGood, [provider]: profileId }; - saveAuthProfileStore(store, agentDir); + freshStore.lastGood = { ...freshStore.lastGood, [provider]: profileId }; + saveAuthProfileStore(freshStore, agentDir); } export function resolveAuthStorePathForDisplay(): string { diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index a80b6a982..3339a6031 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -909,7 +909,12 @@ export async function runEmbeddedPiAgent(params: { `embedded run prompt end: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - promptStartedAt}`, ); } - await waitForCompactionRetry(); + try { + await 
waitForCompactionRetry(); + } catch (err) { + // Capture AbortError from waitForCompactionRetry to enable fallback/rotation + if (!promptError) promptError = err; + } messagesSnapshot = session.messages.slice(); sessionIdUsed = session.sessionId; } finally {