From aa30995aa18628f77adb22b4394e1fffe19c0398 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 10 Jan 2026 21:16:54 +0000 Subject: [PATCH] test(live): add provider filters + google skip rules --- docs/testing.md | 10 ++++++ patches/@mariozechner__pi-ai@0.42.2.patch | 4 +-- src/agents/models.profiles.live.test.ts | 34 +++++++++++++++++++ .../gateway-models.profiles.live.test.ts | 22 +++++++++++- 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/docs/testing.md b/docs/testing.md index 724529bff..fdebd61c7 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -102,6 +102,8 @@ Live tests are split into two layers so we can isolate failures: - How to select models: - `CLAWDBOT_LIVE_MODELS=all` to run everything with keys - or `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,..."` (comma allowlist) +- How to select providers: + - `CLAWDBOT_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist) - Where keys come from: - By default: profile store and env fallbacks - Set `CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS=1` to enforce **profile store** only @@ -126,11 +128,19 @@ Live tests are split into two layers so we can isolate failures: - How to select models: - `CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1` to scan all discovered models with keys - or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model,provider/model,..."` to narrow quickly +- How to select providers (avoid “OpenRouter everything”): + - `CLAWDBOT_LIVE_GATEWAY_PROVIDERS="google,google-antigravity,google-gemini-cli,openai,anthropic,zai,minimax"` (comma allowlist) - Optional tool-calling stress: - `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “bash writes file → read reads it back → echo nonce” check. - This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.). - Optional image send smoke: - `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image. + - Flow (high level): + - Test generates a tiny PNG with “CAT” + random code (`src/gateway/live-image-probe.ts`) + - Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "" }]` + - Gateway parses attachments into `images[]` (`src/gateway/server-methods/agent.ts` + `src/gateway/chat-attachments.ts`) + - Embedded agent forwards a multimodal user message to the model + - Assertion: reply contains `cat` + the code (OCR tolerance: minor mistakes allowed) ### Recommended live recipes diff --git a/patches/@mariozechner__pi-ai@0.42.2.patch b/patches/@mariozechner__pi-ai@0.42.2.patch index 956de72a0..d87422aad 100644 --- a/patches/@mariozechner__pi-ai@0.42.2.patch +++ b/patches/@mariozechner__pi-ai@0.42.2.patch @@ -7,7 +7,7 @@ index 93aa26c395e9bd0df64376408a13d15ee9e7cce7..beb585e2f2c13eec3bca98acade76110 } const errorText = await response.text(); + // Fail immediately on 429 for Antigravity to let callers rotate accounts. -+ // Antigravity rate limits can have very long retry delays (10+ minutes). ++ // Antigravity rate limits can have very long retry delays (10+ minutes). Repro: LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_PROVIDERS=\"google-antigravity\" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts + if (isAntigravity && response.status === 429) { + throw new Error(`Cloud Code Assist API error (${response.status}): ${errorText}`); + } @@ -56,7 +56,7 @@ index 188a8294f26fe1bfe3fb298a7f58e4d8eaf2a529..a3aeb6a7ff53bc4f7f44362adb950b2c description: tool.description, parameters: tool.parameters, - strict: null, -+ strict: false, ++ strict: false, // Repro: LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_MODELS=\"openai-codex/gpt-5.2\" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts })); } function mapStopReason(status) { diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index 9e5ec9668..ee7b1c6c4 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -19,6 +19,16 @@ const REQUIRE_PROFILE_KEYS = const describeLive = LIVE && ALL_MODELS ? describe : describe.skip; +function parseProviderFilter(raw?: string): Set | null { + const trimmed = raw?.trim(); + if (!trimmed || trimmed === "all") return null; + const ids = trimmed + .split(",") + .map((s) => s.trim()) + .filter(Boolean); + return ids.length ? new Set(ids) : null; +} + function parseModelFilter(raw?: string): Set | null { const trimmed = raw?.trim(); if (!trimmed || trimmed === "all") return null; @@ -29,6 +39,15 @@ function parseModelFilter(raw?: string): Set | null { return ids.length ? new Set(ids) : null; } +function isGoogleModelNotFoundError(err: unknown): boolean { + const msg = String(err); + if (!/not found/i.test(msg)) return false; + if (/models\/.+ is not found for api version/i.test(msg)) return true; + if (/"status"\\s*:\\s*"NOT_FOUND"/.test(msg)) return true; + if (/"code"\\s*:\\s*404/.test(msg)) return true; + return false; +} + describeLive("live models (profile keys)", () => { it( "completes across configured models", @@ -42,11 +61,15 @@ describeLive("live models (profile keys)", () => { const models = modelRegistry.getAll() as Array>; const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS); + const providers = parseProviderFilter( + process.env.CLAWDBOT_LIVE_PROVIDERS, + ); const failures: Array<{ model: string; error: string }> = []; const skipped: Array<{ model: string; reason: string }> = []; for (const model of models) { + if (providers && !providers.has(model.provider)) continue; const id = `${model.provider}/${model.id}`; if (filter && !filter.has(id)) continue; @@ -168,8 +191,19 @@ describeLive("live models (profile keys)", () => { .filter((block) => block.type === "text") .map((block) => block.text.trim()) .join(" "); + if (text.length === 0 && model.provider === "google") { + skipped.push({ + model: id, + reason: "no text returned (likely unavailable model id)", + }); + continue; + } expect(text.length).toBeGreaterThan(0); } catch (err) { + if (model.provider === "google" && isGoogleModelNotFoundError(err)) { + skipped.push({ model: id, reason: String(err) }); + continue; + } failures.push({ model: id, error: String(err) }); } } diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index 4c5fb6960..6bb4cc740 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -27,6 +27,7 @@ const ALL_MODELS = const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1"; const EXTRA_IMAGE_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1"; +const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS); const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip; @@ -63,6 +64,16 @@ function isMeaningful(text: string): boolean { return true; } +function isGoogleModelNotFoundText(text: string): boolean { + const trimmed = text.trim(); + if (!trimmed) return false; + if (!/not found/i.test(trimmed)) return false; + if (/models\/.+ is not found for api version/i.test(trimmed)) return true; + if (/"status"\s*:\s*"NOT_FOUND"/.test(trimmed)) return true; + if (/"code"\s*:\s*404/.test(trimmed)) return true; + return false; +} + function randomImageProbeCode(len = 10): string { const alphabet = "2345689ABCEF"; const bytes = randomBytes(len); @@ -233,6 +244,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { const candidates: Array> = []; for (const model of wanted) { const id = `${model.provider}/${model.id}`; + if (PROVIDERS && !PROVIDERS.has(model.provider)) continue; if (filter && !filter.has(id)) continue; try { // eslint-disable-next-line no-await-in-loop @@ -345,6 +357,14 @@ describeLive("gateway live (dev agent, profile keys)", () => { throw new Error(`agent status=${String(payload?.status)}`); } const text = extractPayloadText(payload?.result); + if ( + model.provider === "google" && + isGoogleModelNotFoundText(text) + ) { + // Catalog drift: model IDs can disappear or become unavailable on the API. + // Treat as skip when scanning "all models" for Google. + continue; + } if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`); if ( !/\bmicro\s*-?\s*tasks?\b/i.test(text) || @@ -453,7 +473,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { if (Math.abs(cand.length - imageCode.length) > 2) return best; return Math.min(best, editDistance(cand, imageCode)); }, Number.POSITIVE_INFINITY); - if (!(bestDistance <= 1)) { + if (!(bestDistance <= 2)) { throw new Error( `image probe missing code (${imageCode}): ${imageText}`, );