diff --git a/docs/concepts/model-providers.md b/docs/concepts/model-providers.md index 5b2c7b731..9ed6b713a 100644 --- a/docs/concepts/model-providers.md +++ b/docs/concepts/model-providers.md @@ -76,7 +76,7 @@ Clawdbot ships with the pi‑ai catalog. These providers require **no** - Provider: `google` - Auth: `GEMINI_API_KEY` -- Example model: `google/gemini-3-pro` +- Example model: `google/gemini-3-pro-preview` - CLI: `clawdbot onboard --auth-choice gemini-api-key` ### Google Vertex / Antigravity / Gemini CLI diff --git a/docs/testing.md b/docs/testing.md index ecd636cb5..832a69f86 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -25,9 +25,8 @@ When you touch tests or want extra confidence: - Coverage gate: `pnpm test:coverage` - E2E suite: `pnpm test:e2e` -When debugging real providers/models (requires real creds; skipped by default): -- Live suite (models only): `CLAWDBOT_LIVE_TEST=1 pnpm test:live` -- Live suite (models + providers): `LIVE=1 pnpm test:live` +When debugging real providers/models (requires real creds): +- Live suite (models + gateway tool/image probes): `pnpm test:live` Tip: when you only need one failing case, prefer narrowing live tests via the allowlist env vars described below. @@ -67,7 +66,7 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost): - Command: `pnpm test:live` - Config: `vitest.live.config.ts` - Files: `src/**/*.live.test.ts` -- Default: **skipped** unless `CLAWDBOT_LIVE_TEST=1` or `LIVE=1` +- Default: **enabled** by `pnpm test:live` (sets `CLAWDBOT_LIVE_TEST=1`) - Scope: - “Does this provider/model actually work *today* with real creds?” - Catch provider format changes, tool-calling quirks, auth issues, and rate limit behavior @@ -75,6 +74,8 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost): - Not CI-stable by design (real networks, real provider policies, quotas, outages) - Costs money / uses rate limits - Prefer running narrowed subsets instead of “everything” + - Live runs will source `~/.profile` to pick up missing API keys + - Anthropic key rotation: set `CLAWDBOT_LIVE_ANTHROPIC_KEYS="sk-...,sk-..."` (or `CLAWDBOT_LIVE_ANTHROPIC_KEY=sk-...`) or multiple `ANTHROPIC_API_KEY*` vars; tests will retry on rate limits ## Which suite should I run? 
@@ -97,10 +98,11 @@ Live tests are split into two layers so we can isolate failures: - Use `getApiKeyForModel` to select models you have creds for - Run a small completion per model (and targeted regressions where needed) - How to enable: - - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1` - - `CLAWDBOT_LIVE_ALL_MODELS=1` (required for this test to run) + - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly) +- Set `CLAWDBOT_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke - How to select models: - - `CLAWDBOT_LIVE_MODELS=all` to run everything with keys + - `CLAWDBOT_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4) + - `CLAWDBOT_LIVE_MODELS=all` is an alias for the modern allowlist - or `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,..."` (comma allowlist) - How to select providers: - `CLAWDBOT_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist) @@ -128,18 +130,16 @@ Live tests are split into two layers so we can isolate failures: - image probe: the test attaches a generated PNG (cat + randomized code) and expects the model to return `cat `. - Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `src/gateway/live-image-probe.ts`. - How to enable: - - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1` - - `CLAWDBOT_LIVE_GATEWAY=1` (required for this test to run) + - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly) - How to select models: - - `CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1` to scan all discovered models with keys - - or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model,provider/model,..."` to narrow quickly + - Default: modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4) + - `CLAWDBOT_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist + - Or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow - How to select providers (avoid “OpenRouter everything”): - `CLAWDBOT_LIVE_GATEWAY_PROVIDERS="google,google-antigravity,google-gemini-cli,openai,anthropic,zai,minimax"` (comma allowlist) -- Optional tool-calling stress: - - `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “exec writes file → read reads it back → echo nonce” check. - - This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.). -- Optional image send smoke: - - `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image. +- Tool + image probes are always on in this live test: + - `read` probe + `exec+read` probe (tool stress) + - image probe runs when the model advertises image input support - Flow (high level): - Test generates a tiny PNG with “CAT” + random code (`src/gateway/live-image-probe.ts`) - Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "" }]` @@ -159,7 +159,7 @@ pnpm clawdbot models list --json - Test: `src/agents/anthropic.setup-token.live.test.ts` - Goal: verify Claude CLI setup-token (or a pasted setup-token profile) can complete an Anthropic prompt. 
- Enable: - - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1` + - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly) - `CLAWDBOT_LIVE_SETUP_TOKEN=1` - Token sources (pick one): - Profile: `CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test` @@ -171,7 +171,7 @@ Setup example: ```bash clawdbot models auth paste-token --provider anthropic --profile-id anthropic:setup-token-test -CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts +CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts ``` ## Live: CLI backend smoke (Claude CLI or other local CLIs) @@ -179,7 +179,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI - Test: `src/gateway/gateway-cli-backend.live.test.ts` - Goal: validate the Gateway + agent pipeline using a local CLI backend, without touching your default config. - Enable: - - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1` + - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly) - `CLAWDBOT_LIVE_CLI_BACKEND=1` - Defaults: - Model: `claude-cli/claude-sonnet-4-5` @@ -200,7 +200,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI Example: ```bash -CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \ +CLAWDBOT_LIVE_CLI_BACKEND=1 \ CLAWDBOT_LIVE_CLI_BACKEND_MODEL="claude-cli/claude-sonnet-4-5" \ pnpm test:live src/gateway/gateway-cli-backend.live.test.ts ``` @@ -210,17 +210,17 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \ Narrow, explicit allowlists are fastest and least flaky: - Single model, direct (no gateway): - - `CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_ALL_MODELS=1 CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts` + - `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts` - Single model, gateway smoke: - - `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` + - `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` -- Tool calling across several providers (exec + read probe): - - `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` +- Tool calling across several providers: + - `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash-preview,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - Google focus (Gemini API key + Antigravity): - - Gemini (API key): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` - - Antigravity (OAuth): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live 
src/gateway/gateway-models.profiles.live.test.ts` + - Gemini (API key): `CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` + - Antigravity (OAuth): `CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` Notes: - `google/...` uses the Gemini API (API key). @@ -240,20 +240,20 @@ This is the “common models” run we expect to keep working: - OpenAI (non-Codex): `openai/gpt-5.2` (optional: `openai/gpt-5.1`) - OpenAI Codex: `openai-codex/gpt-5.2` (optional: `openai-codex/gpt-5.2-codex`) - Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`) -- Google (Gemini API): `google/gemini-3-pro` and `google/gemini-3-flash` (avoid older Gemini 2.x models) +- Google (Gemini API): `google/gemini-3-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models) - Google (Antigravity): `google-antigravity/claude-opus-4-5-thinking` and `google-antigravity/gemini-3-flash` - Z.AI (GLM): `zai/glm-4.7` - MiniMax: `minimax/minimax-m2.1` Run gateway smoke with tools + image: -`LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro,google/gemini-3-flash,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` +`CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts` ### Baseline: tool calling (Read + optional Exec) Pick at least one per provider family: - OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`) - Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`) -- Google: `google/gemini-3-flash` (or `google/gemini-3-pro`) +- Google: `google/gemini-3-flash-preview` (or `google/gemini-3-pro-preview`) - Z.AI (GLM): `zai/glm-4.7` - MiniMax: `minimax/minimax-m2.1` @@ -265,7 +265,7 @@ Optional additional coverage (nice to have): ### Vision: image send (attachment → multimodal message) -Run with `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` and include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.). +Include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.) to exercise the image probe. 
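+
+For example, assuming a Gemini API key is configured, a minimal single-model run that exercises the image probe could be:
+
+```bash
+CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-pro-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts
+```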
### Aggregators / alternate gateways diff --git a/package.json b/package.json index a9a7bdf2a..dc8271191 100644 --- a/package.json +++ b/package.json @@ -97,7 +97,7 @@ "test:force": "tsx scripts/test-force.ts", "test:coverage": "vitest run --coverage", "test:e2e": "vitest run --config vitest.e2e.config.ts", - "test:live": "vitest run --config vitest.live.config.ts", + "test:live": "CLAWDBOT_LIVE_TEST=1 vitest run --config vitest.live.config.ts", "test:docker:onboard": "bash scripts/e2e/onboard-docker.sh", "test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh", "test:docker:live-models": "bash scripts/test-live-models-docker.sh", diff --git a/scripts/test-live-gateway-models-docker.sh b/scripts/test-live-gateway-models-docker.sh index 40a4c065b..c31e356e5 100755 --- a/scripts/test-live-gateway-models-docker.sh +++ b/scripts/test-live-gateway-models-docker.sh @@ -20,10 +20,6 @@ docker run --rm -t \ --entrypoint bash \ -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \ -e HOME=/home/node \ - -e CLAWDBOT_LIVE_TEST=1 \ - -e CLAWDBOT_LIVE_GATEWAY=1 \ - -e CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 \ - -e CLAWDBOT_LIVE_GATEWAY_MODELS="${CLAWDBOT_LIVE_GATEWAY_MODELS:-all}" \ -v "$CONFIG_DIR":/home/node/.clawdbot \ -v "$WORKSPACE_DIR":/home/node/clawd \ "${PROFILE_MOUNT[@]}" \ diff --git a/src/agents/live-auth-keys.ts b/src/agents/live-auth-keys.ts new file mode 100644 index 000000000..81836b0a8 --- /dev/null +++ b/src/agents/live-auth-keys.ts @@ -0,0 +1,50 @@ +const KEY_SPLIT_RE = /[\s,;]+/g; + +function parseKeyList(raw?: string | null): string[] { + if (!raw) return []; + return raw + .split(KEY_SPLIT_RE) + .map((value) => value.trim()) + .filter(Boolean); +} + +function collectEnvPrefixedKeys(prefix: string): string[] { + const keys: string[] = []; + for (const [name, value] of Object.entries(process.env)) { + if (!name.startsWith(prefix)) continue; + const trimmed = value?.trim(); + if (!trimmed) continue; + keys.push(trimmed); + } + return keys; +} + +export function collectAnthropicApiKeys(): string[] { + const forcedSingle = process.env.CLAWDBOT_LIVE_ANTHROPIC_KEY?.trim(); + if (forcedSingle) return [forcedSingle]; + + const fromList = parseKeyList(process.env.CLAWDBOT_LIVE_ANTHROPIC_KEYS); + const fromEnv = collectEnvPrefixedKeys("ANTHROPIC_API_KEY"); + const primary = process.env.ANTHROPIC_API_KEY?.trim(); + + const seen = new Set(); + const add = (value?: string) => { + if (!value) return; + if (seen.has(value)) return; + seen.add(value); + }; + + for (const value of fromList) add(value); + if (primary) add(primary); + for (const value of fromEnv) add(value); + + return Array.from(seen); +} + +export function isAnthropicRateLimitError(message: string): boolean { + const lower = message.toLowerCase(); + if (lower.includes("rate_limit")) return true; + if (lower.includes("rate limit")) return true; + if (lower.includes("429")) return true; + return false; +} diff --git a/src/agents/live-model-filter.ts b/src/agents/live-model-filter.ts new file mode 100644 index 000000000..3ea349c59 --- /dev/null +++ b/src/agents/live-model-filter.ts @@ -0,0 +1,89 @@ +export type ModelRef = { + provider?: string | null; + id?: string | null; +}; + +const ANTHROPIC_PREFIXES = [ + "claude-opus-4-5", + "claude-sonnet-4-5", + "claude-haiku-4-5", +]; +const OPENAI_MODELS = ["gpt-5.2", "gpt-5.0"]; +const CODEX_MODELS = [ + "gpt-5.2", + "gpt-5.2-codex", + "gpt-5.1-codex", + "gpt-5.1-codex-mini", + "gpt-5.1-codex-max", +]; +const GOOGLE_PREFIXES = ["gemini-3"]; +const ZAI_PREFIXES = ["glm-4.7"]; +const 
MINIMAX_PREFIXES = ["minimax-m2.1"]; +const XAI_PREFIXES = ["grok-4"]; + +function matchesPrefix(id: string, prefixes: string[]): boolean { + return prefixes.some((prefix) => id.startsWith(prefix)); +} + +function matchesExactOrPrefix(id: string, values: string[]): boolean { + return values.some((value) => id === value || id.startsWith(value)); +} + +function matchesAny(id: string, values: string[]): boolean { + return values.some((value) => id.includes(value)); +} + +export function isModernModelRef(ref: ModelRef): boolean { + const provider = ref.provider?.trim().toLowerCase() ?? ""; + const id = ref.id?.trim().toLowerCase() ?? ""; + if (!provider || !id) return false; + + if (provider === "anthropic") { + return matchesPrefix(id, ANTHROPIC_PREFIXES); + } + + if (provider === "openai") { + return matchesExactOrPrefix(id, OPENAI_MODELS); + } + + if (provider === "openai-codex") { + return matchesExactOrPrefix(id, CODEX_MODELS); + } + + if (provider === "google" || provider === "google-gemini-cli") { + return matchesPrefix(id, GOOGLE_PREFIXES); + } + + if (provider === "google-antigravity") { + return ( + matchesPrefix(id, GOOGLE_PREFIXES) || + matchesPrefix(id, ANTHROPIC_PREFIXES) + ); + } + + if (provider === "zai") { + return matchesPrefix(id, ZAI_PREFIXES); + } + + if (provider === "minimax") { + return matchesPrefix(id, MINIMAX_PREFIXES); + } + + if (provider === "xai") { + return matchesPrefix(id, XAI_PREFIXES); + } + + if (provider === "openrouter" || provider === "opencode") { + return matchesAny(id, [ + ...ANTHROPIC_PREFIXES, + ...OPENAI_MODELS, + ...CODEX_MODELS, + ...GOOGLE_PREFIXES, + ...ZAI_PREFIXES, + ...MINIMAX_PREFIXES, + ...XAI_PREFIXES, + ]); + } + + return false; +} diff --git a/src/agents/models-config.test.ts b/src/agents/models-config.test.ts index 364d8e066..78053f869 100644 --- a/src/agents/models-config.test.ts +++ b/src/agents/models-config.test.ts @@ -117,4 +117,59 @@ describe("models config", () => { ); }); }); + + it("normalizes gemini 3 ids to preview for google providers", async () => { + await withTempHome(async () => { + vi.resetModules(); + const { ensureClawdbotModelsJson } = await import("./models-config.js"); + const { resolveClawdbotAgentDir } = await import("./agent-paths.js"); + + const cfg: ClawdbotConfig = { + models: { + providers: { + google: { + baseUrl: "https://generativelanguage.googleapis.com/v1beta", + apiKey: "GEMINI_KEY", + api: "google-generative-ai", + models: [ + { + id: "gemini-3-pro", + name: "Gemini 3 Pro", + api: "google-generative-ai", + reasoning: true, + input: ["text", "image"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 1048576, + maxTokens: 65536, + }, + { + id: "gemini-3-flash", + name: "Gemini 3 Flash", + api: "google-generative-ai", + reasoning: false, + input: ["text", "image"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 1048576, + maxTokens: 65536, + }, + ], + }, + }, + }, + }; + + await ensureClawdbotModelsJson(cfg); + + const modelPath = path.join(resolveClawdbotAgentDir(), "models.json"); + const raw = await fs.readFile(modelPath, "utf8"); + const parsed = JSON.parse(raw) as { + providers: Record }>; + }; + const ids = parsed.providers.google?.models?.map((model) => model.id); + expect(ids).toEqual([ + "gemini-3-pro-preview", + "gemini-3-flash-preview", + ]); + }); + }); }); diff --git a/src/agents/models-config.ts b/src/agents/models-config.ts index 2887554f5..d3fe04078 100644 --- a/src/agents/models-config.ts +++ b/src/agents/models-config.ts @@ 
-5,6 +5,7 @@ import { type ClawdbotConfig, loadConfig } from "../config/config.js"; import { resolveClawdbotAgentDir } from "./agent-paths.js"; type ModelsConfig = NonNullable; +type ProviderConfig = NonNullable[string]; const DEFAULT_MODE: NonNullable = "merge"; @@ -12,6 +13,38 @@ function isRecord(value: unknown): value is Record { return Boolean(value && typeof value === "object" && !Array.isArray(value)); } +function normalizeGoogleModelId(id: string): string { + if (id === "gemini-3-pro") return "gemini-3-pro-preview"; + if (id === "gemini-3-flash") return "gemini-3-flash-preview"; + return id; +} + +function normalizeGoogleProvider(provider: ProviderConfig): ProviderConfig { + let mutated = false; + const models = provider.models.map((model) => { + const nextId = normalizeGoogleModelId(model.id); + if (nextId === model.id) return model; + mutated = true; + return { ...model, id: nextId }; + }); + return mutated ? { ...provider, models } : provider; +} + +function normalizeProviders( + providers: ModelsConfig["providers"], +): ModelsConfig["providers"] { + if (!providers) return providers; + let mutated = false; + const next: Record = {}; + for (const [key, provider] of Object.entries(providers)) { + const normalized = + key === "google" ? normalizeGoogleProvider(provider) : provider; + if (normalized !== provider) mutated = true; + next[key] = normalized; + } + return mutated ? next : providers; +} + async function readJson(pathname: string): Promise { try { const raw = await fs.readFile(pathname, "utf8"); @@ -53,7 +86,8 @@ export async function ensureClawdbotModelsJson( } } - const next = `${JSON.stringify({ providers: mergedProviders }, null, 2)}\n`; + const normalizedProviders = normalizeProviders(mergedProviders); + const next = `${JSON.stringify({ providers: normalizedProviders }, null, 2)}\n`; try { existingRaw = await fs.readFile(targetPath, "utf8"); } catch { diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index d593773d0..56b6489dc 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -7,24 +7,20 @@ import { Type } from "@sinclair/typebox"; import { describe, expect, it } from "vitest"; import { loadConfig } from "../config/config.js"; import { resolveClawdbotAgentDir } from "./agent-paths.js"; -import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js"; -import { getApiKeyForModel } from "./model-auth.js"; import { - buildModelAliasIndex, - parseModelRef, - resolveConfiguredModelRef, - resolveModelRefFromString, -} from "./model-selection.js"; + collectAnthropicApiKeys, + isAnthropicRateLimitError, +} from "./live-auth-keys.js"; +import { isModernModelRef } from "./live-model-filter.js"; +import { getApiKeyForModel } from "./model-auth.js"; import { ensureClawdbotModelsJson } from "./models-config.js"; const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1"; -const ALL_MODELS = - process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" || - process.env.CLAWDBOT_LIVE_MODELS === "all"; +const DIRECT_ENABLED = Boolean(process.env.CLAWDBOT_LIVE_MODELS?.trim()); const REQUIRE_PROFILE_KEYS = process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1"; -const describeLive = LIVE && ALL_MODELS ? describe : describe.skip; +const describeLive = LIVE ? describe : describe.skip; function parseProviderFilter(raw?: string): Set | null { const trimmed = raw?.trim(); @@ -46,6 +42,10 @@ function parseModelFilter(raw?: string): Set | null { return ids.length ? 
new Set(ids) : null; } +function logProgress(message: string): void { + console.log(`[live] ${message}`); +} + function isGoogleModelNotFoundError(err: unknown): boolean { const msg = String(err); if (!/not found/i.test(msg)) return false; @@ -127,75 +127,25 @@ async function completeOkWithRetry(params: { return await runOnce(); } -function resolveConfiguredModelKeys( - cfg: ReturnType, -): string[] { - const aliasIndex = buildModelAliasIndex({ - cfg, - defaultProvider: DEFAULT_PROVIDER, - }); - const order: string[] = []; - const seen = new Set(); - - const addKey = (key: string) => { - const normalized = key.trim(); - if (!normalized || seen.has(normalized)) return; - seen.add(normalized); - order.push(normalized); - }; - - const addRef = (ref: { provider: string; model: string }) => { - addKey(`${ref.provider}/${ref.model}`); - }; - - addRef( - resolveConfiguredModelRef({ - cfg, - defaultProvider: DEFAULT_PROVIDER, - defaultModel: DEFAULT_MODEL, - }), - ); - - const modelConfig = cfg.agents?.defaults?.model as - | { primary?: string; fallbacks?: string[] } - | undefined; - const imageModelConfig = cfg.agents?.defaults?.imageModel as - | { primary?: string; fallbacks?: string[] } - | undefined; - - const primary = modelConfig?.primary?.trim() ?? ""; - const fallbacks = modelConfig?.fallbacks ?? []; - const imagePrimary = imageModelConfig?.primary?.trim() ?? ""; - const imageFallbacks = imageModelConfig?.fallbacks ?? []; - - const addRaw = (raw: string) => { - const resolved = resolveModelRefFromString({ - raw, - defaultProvider: DEFAULT_PROVIDER, - aliasIndex, - }); - if (resolved) addRef(resolved.ref); - }; - - if (primary) addRaw(primary); - for (const raw of fallbacks) addRaw(String(raw ?? "")); - if (imagePrimary) addRaw(imagePrimary); - for (const raw of imageFallbacks) addRaw(String(raw ?? "")); - - for (const key of Object.keys(cfg.agents?.defaults?.models ?? {})) { - const parsed = parseModelRef(String(key ?? ""), DEFAULT_PROVIDER); - if (parsed) addRef(parsed); - } - - return order; -} - describeLive("live models (profile keys)", () => { it( - "completes across configured models", + "completes across selected models", async () => { const cfg = loadConfig(); await ensureClawdbotModelsJson(cfg); + if (!DIRECT_ENABLED) { + logProgress( + "[live-models] skipping (set CLAWDBOT_LIVE_MODELS=modern|all|; all=modern)", + ); + return; + } + const anthropicKeys = collectAnthropicApiKeys(); + if (anthropicKeys.length > 0) { + process.env.ANTHROPIC_API_KEY = anthropicKeys[0]; + logProgress( + `[live-models] anthropic keys loaded: ${anthropicKeys.length}`, + ); + } const agentDir = resolveClawdbotAgentDir(); const authStorage = discoverAuthStorage(agentDir); @@ -205,7 +155,11 @@ describeLive("live models (profile keys)", () => { models.map((model) => [`${model.provider}/${model.id}`, model]), ); - const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS); + const rawModels = process.env.CLAWDBOT_LIVE_MODELS?.trim(); + const useModern = rawModels === "modern" || rawModels === "all"; + const useExplicit = Boolean(rawModels) && !useModern; + const filter = useExplicit ? 
parseModelFilter(rawModels) : null; + const allowNotFoundSkip = useModern; const providers = parseProviderFilter( process.env.CLAWDBOT_LIVE_PROVIDERS, ); @@ -216,149 +170,196 @@ describeLive("live models (profile keys)", () => { const failures: Array<{ model: string; error: string }> = []; const skipped: Array<{ model: string; reason: string }> = []; + const candidates: Array<{ + model: Model; + apiKeyInfo: Awaited>; + }> = []; - const configuredKeys = resolveConfiguredModelKeys(cfg); - - for (const key of configuredKeys) { - const model = modelByKey.get(key); - if (!model) { - skipped.push({ - model: key, - reason: "configured model missing in registry", - }); - continue; - } + for (const model of models) { if (providers && !providers.has(model.provider)) continue; const id = `${model.provider}/${model.id}`; if (filter && !filter.has(id)) continue; - - let apiKeyInfo: Awaited>; - try { - apiKeyInfo = await getApiKeyForModel({ model, cfg }); - } catch (err) { - skipped.push({ model: id, reason: String(err) }); - continue; - } - - if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) { - skipped.push({ - model: id, - reason: `non-profile credential source: ${apiKeyInfo.source}`, - }); - continue; - } - - try { - // Special regression: OpenAI requires replayed `reasoning` items for tool-only turns. - if ( - model.provider === "openai" && - model.api === "openai-responses" && - model.id === "gpt-5.2" - ) { - const noopTool = { - name: "noop", - description: "Return ok.", - parameters: Type.Object({}, { additionalProperties: false }), - }; - - const first = await completeSimpleWithTimeout( - model, - { - messages: [ - { - role: "user", - content: - "Call the tool `noop` with {}. Do not write any other text.", - timestamp: Date.now(), - }, - ], - tools: [noopTool], - }, - { - apiKey: apiKeyInfo.apiKey, - reasoning: model.reasoning ? "low" : undefined, - maxTokens: 128, - }, - perModelTimeoutMs, - ); - - const toolCall = first.content.find((b) => b.type === "toolCall"); - expect(toolCall).toBeTruthy(); - if (!toolCall || toolCall.type !== "toolCall") { - throw new Error("expected tool call"); - } - - const second = await completeSimpleWithTimeout( - model, - { - messages: [ - { - role: "user", - content: - "Call the tool `noop` with {}. Do not write any other text.", - timestamp: Date.now(), - }, - first, - { - role: "toolResult", - toolCallId: toolCall.id, - toolName: "noop", - content: [{ type: "text", text: "ok" }], - isError: false, - timestamp: Date.now(), - }, - { - role: "user", - content: "Reply with the word ok.", - timestamp: Date.now(), - }, - ], - }, - { - apiKey: apiKeyInfo.apiKey, - reasoning: model.reasoning ? "low" : undefined, - maxTokens: 64, - }, - perModelTimeoutMs, - ); - - const secondText = second.content - .filter((b) => b.type === "text") - .map((b) => b.text.trim()) - .join(" "); - expect(secondText.length).toBeGreaterThan(0); + if (!filter && useModern) { + if (!isModernModelRef({ provider: model.provider, id: model.id })) { continue; } - - const ok = await completeOkWithRetry({ - model, - apiKey: apiKeyInfo.apiKey, - timeoutMs: perModelTimeoutMs, - }); - - if (ok.res.stopReason === "error") { - const msg = ok.res.errorMessage ?? 
""; - if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) { - skipped.push({ model: id, reason: msg }); - continue; - } - throw new Error(msg || "model returned error with no message"); - } - - if (ok.text.length === 0 && model.provider === "google") { + } + try { + const apiKeyInfo = await getApiKeyForModel({ model, cfg }); + if ( + REQUIRE_PROFILE_KEYS && + !apiKeyInfo.source.startsWith("profile:") + ) { skipped.push({ model: id, - reason: "no text returned (likely unavailable model id)", + reason: `non-profile credential source: ${apiKeyInfo.source}`, }); continue; } - expect(ok.text.length).toBeGreaterThan(0); + candidates.push({ model, apiKeyInfo }); } catch (err) { - if (model.provider === "google" && isGoogleModelNotFoundError(err)) { - skipped.push({ model: id, reason: String(err) }); - continue; + skipped.push({ model: id, reason: String(err) }); + } + } + + if (candidates.length === 0) { + logProgress("[live-models] no API keys found; skipping"); + return; + } + + logProgress( + `[live-models] selection=${useExplicit ? "explicit" : "modern"}`, + ); + logProgress(`[live-models] running ${candidates.length} models`); + const total = candidates.length; + + for (const [index, entry] of candidates.entries()) { + const { model, apiKeyInfo } = entry; + const id = `${model.provider}/${model.id}`; + const progressLabel = `[live-models] ${index + 1}/${total} ${id}`; + const attemptMax = + model.provider === "anthropic" && anthropicKeys.length > 0 + ? anthropicKeys.length + : 1; + for (let attempt = 0; attempt < attemptMax; attempt += 1) { + if (model.provider === "anthropic" && anthropicKeys.length > 0) { + process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt]; + } + const apiKey = + model.provider === "anthropic" && anthropicKeys.length > 0 + ? anthropicKeys[attempt] + : apiKeyInfo.apiKey; + try { + // Special regression: OpenAI requires replayed `reasoning` items for tool-only turns. + if ( + model.provider === "openai" && + model.api === "openai-responses" && + model.id === "gpt-5.2" + ) { + logProgress(`${progressLabel}: tool-only regression`); + const noopTool = { + name: "noop", + description: "Return ok.", + parameters: Type.Object({}, { additionalProperties: false }), + }; + + const first = await completeSimpleWithTimeout( + model, + { + messages: [ + { + role: "user", + content: + "Call the tool `noop` with {}. Do not write any other text.", + timestamp: Date.now(), + }, + ], + tools: [noopTool], + }, + { + apiKey, + reasoning: model.reasoning ? "low" : undefined, + maxTokens: 128, + }, + perModelTimeoutMs, + ); + + const toolCall = first.content.find((b) => b.type === "toolCall"); + expect(toolCall).toBeTruthy(); + if (!toolCall || toolCall.type !== "toolCall") { + throw new Error("expected tool call"); + } + + const second = await completeSimpleWithTimeout( + model, + { + messages: [ + { + role: "user", + content: + "Call the tool `noop` with {}. Do not write any other text.", + timestamp: Date.now(), + }, + first, + { + role: "toolResult", + toolCallId: toolCall.id, + toolName: "noop", + content: [{ type: "text", text: "ok" }], + isError: false, + timestamp: Date.now(), + }, + { + role: "user", + content: "Reply with the word ok.", + timestamp: Date.now(), + }, + ], + }, + { + apiKey, + reasoning: model.reasoning ? 
"low" : undefined, + maxTokens: 64, + }, + perModelTimeoutMs, + ); + + const secondText = second.content + .filter((b) => b.type === "text") + .map((b) => b.text.trim()) + .join(" "); + expect(secondText.length).toBeGreaterThan(0); + logProgress(`${progressLabel}: done`); + break; + } + + logProgress(`${progressLabel}: prompt`); + const ok = await completeOkWithRetry({ + model, + apiKey, + timeoutMs: perModelTimeoutMs, + }); + + if (ok.res.stopReason === "error") { + const msg = ok.res.errorMessage ?? ""; + if (allowNotFoundSkip && isModelNotFoundErrorMessage(msg)) { + skipped.push({ model: id, reason: msg }); + logProgress(`${progressLabel}: skip (model not found)`); + break; + } + throw new Error(msg || "model returned error with no message"); + } + + if (ok.text.length === 0 && model.provider === "google") { + skipped.push({ + model: id, + reason: "no text returned (likely unavailable model id)", + }); + logProgress(`${progressLabel}: skip (google model not found)`); + break; + } + expect(ok.text.length).toBeGreaterThan(0); + logProgress(`${progressLabel}: done`); + break; + } catch (err) { + const message = String(err); + if ( + model.provider === "anthropic" && + isAnthropicRateLimitError(message) && + attempt + 1 < attemptMax + ) { + logProgress(`${progressLabel}: rate limit, retrying with next key`); + continue; + } + if (model.provider === "google" && isGoogleModelNotFoundError(err)) { + skipped.push({ model: id, reason: message }); + logProgress(`${progressLabel}: skip (google model not found)`); + break; + } + logProgress(`${progressLabel}: failed`); + failures.push({ model: id, error: message }); + break; } - failures.push({ model: id, error: String(err) }); } } @@ -372,8 +373,6 @@ describeLive("live models (profile keys)", () => { ); } - // Keep one assertion so the test fails loudly if we somehow ran nothing. 
- expect(models.length).toBeGreaterThan(0); void skipped; }, 15 * 60 * 1000, diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index 358804303..d2089f72a 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -11,9 +11,15 @@ import { } from "@mariozechner/pi-coding-agent"; import { describe, expect, it } from "vitest"; import { resolveClawdbotAgentDir } from "../agents/agent-paths.js"; +import { + collectAnthropicApiKeys, + isAnthropicRateLimitError, +} from "../agents/live-auth-keys.js"; +import { isModernModelRef } from "../agents/live-model-filter.js"; import { getApiKeyForModel } from "../agents/model-auth.js"; import { ensureClawdbotModelsJson } from "../agents/models-config.js"; import { loadConfig } from "../config/config.js"; +import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js"; import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES, @@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js"; const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1"; const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1"; -const ALL_MODELS = - process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" || - process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all"; -const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1"; -const EXTRA_IMAGE_PROBES = - process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1"; const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1"; const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS); +const THINKING_LEVEL = "high"; +const THINKING_TAG_RE = + /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; +const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i; -const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip; +const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip; function parseFilter(raw?: string): Set | null { const trimmed = raw?.trim(); @@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set | null { return ids.length ? new Set(ids) : null; } +function logProgress(message: string): void { + console.log(`[live] ${message}`); +} + +function assertNoReasoningTags(params: { + text: string; + model: string; + phase: string; + label: string; +}): void { + if (!params.text) return; + if (THINKING_TAG_RE.test(params.text) || FINAL_TAG_RE.test(params.text)) { + const snippet = + params.text.length > 200 ? `${params.text.slice(0, 200)}…` : params.text; + throw new Error( + `[${params.label}] reasoning tag leak (${params.model} / ${params.phase}): ${snippet}`, + ); + } +} + function extractPayloadText(result: unknown): string { const record = result as Record; const payloads = Array.isArray(record.payloads) ? record.payloads : []; @@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) { }); } +type GatewayModelSuiteParams = { + label: string; + cfg: ClawdbotConfig; + candidates: Array>; + extraToolProbes: boolean; + extraImageProbes: boolean; + thinkingLevel: string; + providerOverrides?: Record; +}; + +function buildLiveGatewayConfig(params: { + cfg: ClawdbotConfig; + candidates: Array>; + providerOverrides?: Record; +}): ClawdbotConfig { + const lmstudioProvider = params.cfg.models?.providers?.lmstudio; + const baseProviders = params.cfg.models?.providers ?? {}; + const nextProviders = { + ...baseProviders, + ...(lmstudioProvider + ? 
{ + lmstudio: { + ...lmstudioProvider, + api: "openai-completions", + }, + } + : {}), + ...(params.providerOverrides ?? {}), + }; + const providers = + Object.keys(nextProviders).length > 0 ? nextProviders : baseProviders; + return { + ...params.cfg, + agents: { + ...params.cfg.agents, + list: (params.cfg.agents?.list ?? []).map((entry) => ({ + ...entry, + sandbox: { mode: "off" }, + })), + defaults: { + ...params.cfg.agents?.defaults, + // Live tests should avoid Docker sandboxing so tool probes can + // operate on the temporary probe files we create in the host workspace. + sandbox: { mode: "off" }, + models: Object.fromEntries( + params.candidates.map((m) => [`${m.provider}/${m.id}`, {}]), + ), + }, + }, + models: + Object.keys(providers).length > 0 + ? { ...params.cfg.models, providers } + : params.cfg.models, + }; +} + +function buildMinimaxProviderOverride(params: { + cfg: ClawdbotConfig; + api: "openai-completions" | "anthropic-messages"; + baseUrl: string; +}): ModelProviderConfig | null { + const existing = params.cfg.models?.providers?.minimax; + if (!existing || !Array.isArray(existing.models) || existing.models.length === 0) + return null; + return { + ...existing, + api: params.api, + baseUrl: params.baseUrl, + }; +} + +async function runGatewayModelSuite(params: GatewayModelSuiteParams) { + const previous = { + configPath: process.env.CLAWDBOT_CONFIG_PATH, + token: process.env.CLAWDBOT_GATEWAY_TOKEN, + skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS, + skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER, + skipCron: process.env.CLAWDBOT_SKIP_CRON, + skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST, + }; + + process.env.CLAWDBOT_SKIP_PROVIDERS = "1"; + process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1"; + process.env.CLAWDBOT_SKIP_CRON = "1"; + process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1"; + + const token = `test-${randomUUID()}`; + process.env.CLAWDBOT_GATEWAY_TOKEN = token; + + const workspaceDir = resolveUserPath( + params.cfg.agents?.defaults?.workspace ?? 
path.join(os.homedir(), "clawd"), + ); + await fs.mkdir(workspaceDir, { recursive: true }); + const nonceA = randomUUID(); + const nonceB = randomUUID(); + const toolProbePath = path.join( + workspaceDir, + `.clawdbot-live-tool-probe.${nonceA}.txt`, + ); + await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`); + + const nextCfg = buildLiveGatewayConfig({ + cfg: params.cfg, + candidates: params.candidates, + providerOverrides: params.providerOverrides, + }); + const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-")); + const tempConfigPath = path.join(tempDir, "clawdbot.json"); + await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`); + process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath; + + await ensureClawdbotModelsJson(nextCfg); + + const port = await getFreeGatewayPort(); + const server = await startGatewayServer(port, { + bind: "loopback", + auth: { mode: "token", token }, + controlUiEnabled: false, + }); + + const client = await connectClient({ + url: `ws://127.0.0.1:${port}`, + token, + }); + + try { + logProgress( + `[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`, + ); + const anthropicKeys = collectAnthropicApiKeys(); + if (anthropicKeys.length > 0) { + process.env.ANTHROPIC_API_KEY = anthropicKeys[0]; + logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`); + } + const sessionKey = `agent:dev:${params.label}`; + const failures: Array<{ model: string; error: string }> = []; + const total = params.candidates.length; + + for (const [index, model] of params.candidates.entries()) { + const modelKey = `${model.provider}/${model.id}`; + const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`; + + const attemptMax = + model.provider === "anthropic" && anthropicKeys.length > 0 + ? anthropicKeys.length + : 1; + + for (let attempt = 0; attempt < attemptMax; attempt += 1) { + if (model.provider === "anthropic" && anthropicKeys.length > 0) { + process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt]; + } + try { + // Ensure session exists + override model for this run. + await client.request>("sessions.patch", { + key: sessionKey, + model: modelKey, + }); + // Reset between models: avoids cross-provider transcript incompatibilities + // (notably OpenAI Responses requiring reasoning replay for function_call items). + await client.request>("sessions.reset", { + key: sessionKey, + }); + + logProgress(`${progressLabel}: prompt`); + const runId = randomUUID(); + const payload = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId}`, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + + if (payload?.status !== "ok") { + throw new Error(`agent status=${String(payload?.status)}`); + } + const text = extractPayloadText(payload?.result); + if (model.provider === "google" && isGoogleModelNotFoundText(text)) { + // Catalog drift: model IDs can disappear or become unavailable on the API. + // Treat as skip when scanning "all models" for Google. 
+ logProgress(`${progressLabel}: skip (google model not found)`); + break; + } + assertNoReasoningTags({ + text, + model: modelKey, + phase: "prompt", + label: params.label, + }); + if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`); + if ( + !/\bmicro\s*-?\s*tasks?\b/i.test(text) || + !/\bmacro\s*-?\s*tasks?\b/i.test(text) + ) { + throw new Error(`missing required keywords: ${text}`); + } + + // Real tool invocation: force the agent to Read a local file and echo a nonce. + logProgress(`${progressLabel}: tool-read`); + const runIdTool = randomUUID(); + const toolProbe = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdTool}-tool`, + message: + "Clawdbot live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + "Then reply with the two nonce values you read (include both).", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + if (toolProbe?.status !== "ok") { + throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`); + } + const toolText = extractPayloadText(toolProbe?.result); + assertNoReasoningTags({ + text: toolText, + model: modelKey, + phase: "tool-read", + label: params.label, + }); + if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) { + throw new Error(`tool probe missing nonce: ${toolText}`); + } + + if (params.extraToolProbes) { + logProgress(`${progressLabel}: tool-exec`); + const nonceC = randomUUID(); + const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`); + + const execReadProbe = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdTool}-exec-read`, + message: + "Clawdbot live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + "Finally reply including the nonce text you read back.", + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + if (execReadProbe?.status !== "ok") { + throw new Error( + `exec+read probe failed: status=${String(execReadProbe?.status)}`, + ); + } + const execReadText = extractPayloadText(execReadProbe?.result); + assertNoReasoningTags({ + text: execReadText, + model: modelKey, + phase: "tool-exec", + label: params.label, + }); + if (!execReadText.includes(nonceC)) { + throw new Error(`exec+read probe missing nonce: ${execReadText}`); + } + + await fs.rm(toolWritePath, { force: true }); + } + + if (params.extraImageProbes && model.input?.includes("image")) { + logProgress(`${progressLabel}: image`); + const imageCode = randomImageProbeCode(10); + const imageBase64 = renderCatNoncePngBase64(imageCode); + const runIdImage = randomUUID(); + + const imageProbe = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runIdImage}-image`, + message: + "Look at the attached image. Reply with exactly two tokens separated by a single space: " + + "(1) the animal shown or written in the image, lowercase; " + + "(2) the code printed in the image, uppercase. 
No extra text.", + attachments: [ + { + mimeType: "image/png", + fileName: `probe-${runIdImage}.png`, + content: imageBase64, + }, + ], + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + if (imageProbe?.status !== "ok") { + throw new Error( + `image probe failed: status=${String(imageProbe?.status)}`, + ); + } + const imageText = extractPayloadText(imageProbe?.result); + assertNoReasoningTags({ + text: imageText, + model: modelKey, + phase: "image", + label: params.label, + }); + if (!/\bcat\b/i.test(imageText)) { + throw new Error(`image probe missing 'cat': ${imageText}`); + } + const candidates = + imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; + const bestDistance = candidates.reduce((best, cand) => { + if (Math.abs(cand.length - imageCode.length) > 2) return best; + return Math.min(best, editDistance(cand, imageCode)); + }, Number.POSITIVE_INFINITY); + if (!(bestDistance <= 2)) { + throw new Error( + `image probe missing code (${imageCode}): ${imageText}`, + ); + } + } + + // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class). + if ( + (model.provider === "openai" && model.api === "openai-responses") || + (model.provider === "openai-codex" && + model.api === "openai-codex-responses") + ) { + logProgress(`${progressLabel}: tool-only regression`); + const runId2 = randomUUID(); + const first = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId2}-1`, + message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + if (first?.status !== "ok") { + throw new Error(`tool-only turn failed: status=${String(first?.status)}`); + } + const firstText = extractPayloadText(first?.result); + assertNoReasoningTags({ + text: firstText, + model: modelKey, + phase: "tool-only", + label: params.label, + }); + + const second = await client.request( + "agent", + { + sessionKey, + idempotencyKey: `idem-${runId2}-2`, + message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, + thinking: params.thinkingLevel, + deliver: false, + }, + { expectFinal: true }, + ); + if (second?.status !== "ok") { + throw new Error( + `post-tool message failed: status=${String(second?.status)}`, + ); + } + const reply = extractPayloadText(second?.result); + assertNoReasoningTags({ + text: reply, + model: modelKey, + phase: "tool-only-followup", + label: params.label, + }); + if (!reply.includes(nonceA) || !reply.includes(nonceB)) { + throw new Error(`unexpected reply: ${reply}`); + } + } + + logProgress(`${progressLabel}: done`); + break; + } catch (err) { + const message = String(err); + if ( + model.provider === "anthropic" && + isAnthropicRateLimitError(message) && + attempt + 1 < attemptMax + ) { + logProgress(`${progressLabel}: rate limit, retrying with next key`); + continue; + } + // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests. 
+ if ( + model.provider === "openai-codex" && + isRefreshTokenReused(message) + ) { + logProgress(`${progressLabel}: skip (codex refresh token reused)`); + break; + } + logProgress(`${progressLabel}: failed`); + failures.push({ model: modelKey, error: message }); + break; + } + } + } + + if (failures.length > 0) { + const preview = failures + .slice(0, 20) + .map((f) => `- ${f.model}: ${f.error}`) + .join("\n"); + throw new Error( + `gateway live model failures (${failures.length}):\n${preview}`, + ); + } + } finally { + client.stop(); + await server.close({ reason: "live test complete" }); + await fs.rm(toolProbePath, { force: true }); + await fs.rm(tempDir, { recursive: true, force: true }); + + process.env.CLAWDBOT_CONFIG_PATH = previous.configPath; + process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token; + process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders; + process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail; + process.env.CLAWDBOT_SKIP_CRON = previous.skipCron; + process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas; + } +} + describeLive("gateway live (dev agent, profile keys)", () => { it( "runs meaningful prompts across models with available keys", async () => { - const previous = { - configPath: process.env.CLAWDBOT_CONFIG_PATH, - token: process.env.CLAWDBOT_GATEWAY_TOKEN, - skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS, - skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER, - skipCron: process.env.CLAWDBOT_SKIP_CRON, - skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST, - }; - - process.env.CLAWDBOT_SKIP_PROVIDERS = "1"; - process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1"; - process.env.CLAWDBOT_SKIP_CRON = "1"; - process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1"; - - const token = `test-${randomUUID()}`; - process.env.CLAWDBOT_GATEWAY_TOKEN = token; - const cfg = loadConfig(); await ensureClawdbotModelsJson(cfg); - const workspaceDir = resolveUserPath( - cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"), - ); - await fs.mkdir(workspaceDir, { recursive: true }); - const nonceA = randomUUID(); - const nonceB = randomUUID(); - const toolProbePath = path.join( - workspaceDir, - `.clawdbot-live-tool-probe.${nonceA}.txt`, - ); - await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`); - const agentDir = resolveClawdbotAgentDir(); const authStorage = discoverAuthStorage(agentDir); const modelRegistry = discoverModels(authStorage, agentDir); const all = modelRegistry.getAll() as Array>; - const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS); - - // Default: honor user allowlist. Opt-in: scan all models with keys. - const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {}); - const wanted = - ALL_MODELS || allowlistKeys.length === 0 - ? all - : all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`)); + const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim(); + const useModern = + !rawModels || rawModels === "modern" || rawModels === "all"; + const useExplicit = Boolean(rawModels) && !useModern; + const filter = useExplicit ? parseFilter(rawModels) : null; + const wanted = filter + ? 
all.filter((m) => filter.has(`${m.provider}/${m.id}`)) + : all.filter((m) => + isModernModelRef({ provider: m.provider, id: m.id }), + ); const candidates: Array> = []; for (const model of wanted) { const id = `${model.provider}/${model.id}`; if (PROVIDERS && !PROVIDERS.has(model.provider)) continue; - if (filter && !filter.has(id)) continue; try { // eslint-disable-next-line no-await-in-loop await getApiKeyForModel({ model, cfg }); @@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => { } } - expect(candidates.length).toBeGreaterThan(0); - const imageCandidates = EXTRA_IMAGE_PROBES - ? candidates.filter((m) => m.input?.includes("image")) - : []; - if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) { - throw new Error( - "image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model", + if (candidates.length === 0) { + logProgress("[all-models] no API keys found; skipping"); + return; + } + logProgress( + `[all-models] selection=${useExplicit ? "explicit" : "modern"}`, + ); + const imageCandidates = candidates.filter((m) => + m.input?.includes("image"), + ); + if (imageCandidates.length === 0) { + logProgress( + "[all-models] no image-capable models selected; image probe will be skipped", ); } - - // Build a temp config that allows all selected models, so session overrides stick. - const lmstudioProvider = cfg.models?.providers?.lmstudio; - const nextCfg = { - ...cfg, - agents: { - ...cfg.agents, - list: (cfg.agents?.list ?? []).map((entry) => ({ - ...entry, - sandbox: { mode: "off" }, - })), - defaults: { - ...cfg.agents?.defaults, - // Live tests should avoid Docker sandboxing so tool probes can - // operate on the temporary probe files we create in the host workspace. - sandbox: { mode: "off" }, - models: Object.fromEntries( - candidates.map((m) => [`${m.provider}/${m.id}`, {}]), - ), - }, - }, - models: { - ...cfg.models, - providers: { - ...cfg.models?.providers, - // LM Studio is most reliable via Chat Completions; its Responses API - // tool-calling behavior is inconsistent across releases. - ...(lmstudioProvider - ? 
{ - lmstudio: { - ...lmstudioProvider, - api: "openai-completions", - }, - } - : {}), - }, - }, - }; - const tempDir = await fs.mkdtemp( - path.join(os.tmpdir(), "clawdbot-live-"), - ); - const tempConfigPath = path.join(tempDir, "clawdbot.json"); - await fs.writeFile( - tempConfigPath, - `${JSON.stringify(nextCfg, null, 2)}\n`, - ); - process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath; - - const port = await getFreeGatewayPort(); - const server = await startGatewayServer(port, { - bind: "loopback", - auth: { mode: "token", token }, - controlUiEnabled: false, + await runGatewayModelSuite({ + label: "all-models", + cfg, + candidates, + extraToolProbes: true, + extraImageProbes: true, + thinkingLevel: THINKING_LEVEL, }); - const client = await connectClient({ - url: `ws://127.0.0.1:${port}`, - token, + const minimaxCandidates = candidates.filter((model) => model.provider === "minimax"); + if (minimaxCandidates.length === 0) { + logProgress("[minimax] no candidates with keys; skipping dual endpoint probes"); + return; + } + + const minimaxOpenAi = buildMinimaxProviderOverride({ + cfg, + api: "openai-completions", + baseUrl: "https://api.minimax.io/v1", }); + if (minimaxOpenAi) { + await runGatewayModelSuite({ + label: "minimax-openai", + cfg, + candidates: minimaxCandidates, + extraToolProbes: true, + extraImageProbes: true, + thinkingLevel: THINKING_LEVEL, + providerOverrides: { minimax: minimaxOpenAi }, + }); + } else { + logProgress("[minimax-openai] missing minimax provider config; skipping"); + } - try { - const sessionKey = "agent:dev:live-gateway"; - - const failures: Array<{ model: string; error: string }> = []; - - for (const model of candidates) { - const modelKey = `${model.provider}/${model.id}`; - - try { - // Ensure session exists + override model for this run. - await client.request>("sessions.patch", { - key: sessionKey, - model: modelKey, - }); - // Reset between models: avoids cross-provider transcript incompatibilities - // (notably OpenAI Responses requiring reasoning replay for function_call items). - await client.request>("sessions.reset", { - key: sessionKey, - }); - - // “Meaningful” direct prompt (no tools). - const runId = randomUUID(); - const payload = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId}`, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - deliver: false, - }, - { expectFinal: true }, - ); - - if (payload?.status !== "ok") { - throw new Error(`agent status=${String(payload?.status)}`); - } - const text = extractPayloadText(payload?.result); - if ( - model.provider === "google" && - isGoogleModelNotFoundText(text) - ) { - // Catalog drift: model IDs can disappear or become unavailable on the API. - // Treat as skip when scanning "all models" for Google. - continue; - } - if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`); - if ( - !/\bmicro\s*-?\s*tasks?\b/i.test(text) || - !/\bmacro\s*-?\s*tasks?\b/i.test(text) - ) { - throw new Error(`missing required keywords: ${text}`); - } - - // Real tool invocation: force the agent to Read a local file and echo a nonce. - const runIdTool = randomUUID(); - const toolProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-tool`, - message: - "Clawdbot live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. 
` + - "Then reply with the two nonce values you read (include both).", - deliver: false, - }, - { expectFinal: true }, - ); - if (toolProbe?.status !== "ok") { - throw new Error( - `tool probe failed: status=${String(toolProbe?.status)}`, - ); - } - const toolText = extractPayloadText(toolProbe?.result); - if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) { - throw new Error(`tool probe missing nonce: ${toolText}`); - } - - if (EXTRA_TOOL_PROBES) { - const nonceC = randomUUID(); - const toolWritePath = path.join( - tempDir, - `write-${runIdTool}.txt`, - ); - - const execReadProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-exec-read`, - message: - "Clawdbot live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - "Finally reply including the nonce text you read back.", - deliver: false, - }, - { expectFinal: true }, - ); - if (execReadProbe?.status !== "ok") { - throw new Error( - `exec+read probe failed: status=${String(execReadProbe?.status)}`, - ); - } - const execReadText = extractPayloadText(execReadProbe?.result); - if (!execReadText.includes(nonceC)) { - throw new Error( - `exec+read probe missing nonce: ${execReadText}`, - ); - } - - await fs.rm(toolWritePath, { force: true }); - } - - if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) { - const imageCode = randomImageProbeCode(10); - const imageBase64 = renderCatNoncePngBase64(imageCode); - const runIdImage = randomUUID(); - - const imageProbe = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdImage}-image`, - message: - "Look at the attached image. Reply with exactly two tokens separated by a single space: " + - "(1) the animal shown or written in the image, lowercase; " + - "(2) the code printed in the image, uppercase. No extra text.", - attachments: [ - { - mimeType: "image/png", - fileName: `probe-${runIdImage}.png`, - content: imageBase64, - }, - ], - deliver: false, - }, - { expectFinal: true }, - ); - if (imageProbe?.status !== "ok") { - throw new Error( - `image probe failed: status=${String(imageProbe?.status)}`, - ); - } - const imageText = extractPayloadText(imageProbe?.result); - if (!/\bcat\b/i.test(imageText)) { - throw new Error(`image probe missing 'cat': ${imageText}`); - } - const candidates = - imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; - const bestDistance = candidates.reduce((best, cand) => { - if (Math.abs(cand.length - imageCode.length) > 2) return best; - return Math.min(best, editDistance(cand, imageCode)); - }, Number.POSITIVE_INFINITY); - if (!(bestDistance <= 2)) { - throw new Error( - `image probe missing code (${imageCode}): ${imageText}`, - ); - } - } - // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class). - if ( - (model.provider === "openai" && - model.api === "openai-responses") || - (model.provider === "openai-codex" && - model.api === "openai-codex-responses") - ) { - const runId2 = randomUUID(); - const first = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-1`, - message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". 
Do not write any other text.`, - deliver: false, - }, - { expectFinal: true }, - ); - if (first?.status !== "ok") { - throw new Error( - `tool-only turn failed: status=${String(first?.status)}`, - ); - } - - const second = await client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-2`, - message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, - deliver: false, - }, - { expectFinal: true }, - ); - if (second?.status !== "ok") { - throw new Error( - `post-tool message failed: status=${String(second?.status)}`, - ); - } - const reply = extractPayloadText(second?.result); - if (!reply.includes(nonceA) || !reply.includes(nonceB)) { - throw new Error(`unexpected reply: ${reply}`); - } - } - } catch (err) { - const message = String(err); - // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests. - if ( - model.provider === "openai-codex" && - isRefreshTokenReused(message) - ) { - continue; - } - failures.push({ model: modelKey, error: message }); - } - } - - if (failures.length > 0) { - const preview = failures - .slice(0, 20) - .map((f) => `- ${f.model}: ${f.error}`) - .join("\n"); - throw new Error( - `gateway live model failures (${failures.length}):\n${preview}`, - ); - } - } finally { - client.stop(); - await server.close({ reason: "live test complete" }); - await fs.rm(toolProbePath, { force: true }); - await fs.rm(tempDir, { recursive: true, force: true }); - - process.env.CLAWDBOT_CONFIG_PATH = previous.configPath; - process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token; - process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders; - process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail; - process.env.CLAWDBOT_SKIP_CRON = previous.skipCron; - process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas; + const minimaxAnthropic = buildMinimaxProviderOverride({ + cfg, + api: "anthropic-messages", + baseUrl: "https://api.minimax.io/anthropic", + }); + if (minimaxAnthropic) { + await runGatewayModelSuite({ + label: "minimax-anthropic", + cfg, + candidates: minimaxCandidates, + extraToolProbes: true, + extraImageProbes: true, + thinkingLevel: THINKING_LEVEL, + providerOverrides: { minimax: minimaxAnthropic }, + }); + } else { + logProgress("[minimax-anthropic] missing minimax provider config; skipping"); } }, 20 * 60 * 1000, @@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { message: `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, + thinking: THINKING_LEVEL, deliver: false, }, { expectFinal: true }, @@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => { ); } const toolText = extractPayloadText(toolProbe?.result); + assertNoReasoningTags({ + text: toolText, + model: "anthropic/claude-opus-4-5", + phase: "zai-fallback-tool", + label: "zai-fallback", + }); if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) { throw new Error(`anthropic tool probe missing nonce: ${toolText}`); } @@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { message: `What are the values of nonceA and nonceB in "${toolProbePath}"? 
` + `Reply with exactly: ${nonceA} ${nonceB}.`, + thinking: THINKING_LEVEL, deliver: false, }, { expectFinal: true }, @@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => { ); } const followupText = extractPayloadText(followup?.result); + assertNoReasoningTags({ + text: followupText, + model: "zai/glm-4.7", + phase: "zai-fallback-followup", + label: "zai-fallback", + }); if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) { throw new Error(`zai followup missing nonce: ${followupText}`); } diff --git a/test/test-env.ts b/test/test-env.ts index 1b931ac63..3672ff980 100644 --- a/test/test-env.ts +++ b/test/test-env.ts @@ -1,3 +1,4 @@ +import { execFileSync } from "node:child_process"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; @@ -11,6 +12,37 @@ function restoreEnv(entries: RestoreEntry[]): void { } } +function loadProfileEnv(): void { + const profilePath = path.join(os.homedir(), ".profile"); + if (!fs.existsSync(profilePath)) return; + try { + const output = execFileSync( + "/bin/bash", + [ + "-lc", + `set -a; source \"${profilePath}\" >/dev/null 2>&1; env -0`, + ], + { encoding: "utf8" }, + ); + const entries = output.split("\0"); + let applied = 0; + for (const entry of entries) { + if (!entry) continue; + const idx = entry.indexOf("="); + if (idx <= 0) continue; + const key = entry.slice(0, idx); + if (!key || (process.env[key] ?? "") !== "") continue; + process.env[key] = entry.slice(idx + 1); + applied += 1; + } + if (applied > 0) { + console.log(`[live] loaded ${applied} env vars from ~/.profile`); + } + } catch { + // ignore profile load failures + } +} + export function installTestEnv(): { cleanup: () => void; tempHome: string } { const live = process.env.LIVE === "1" || @@ -20,6 +52,7 @@ export function installTestEnv(): { cleanup: () => void; tempHome: string } { // Live tests must use the real user environment (keys, profiles, config). // The default test env isolates HOME to avoid touching real state. if (live) { + loadProfileEnv(); return { cleanup: () => {}, tempHome: process.env.HOME ?? "" }; }
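For reference, a minimal sketch of the precedence `loadProfileEnv()` gives live runs: variables already set in the shell win, and `~/.profile` only fills in gaps. This is illustrative only; the variable names and the standalone-script framing are assumptions, not part of this diff.

    // sketch.ts (hypothetical) — exercises installTestEnv() in live mode.
    import { installTestEnv } from "./test-env";

    process.env.LIVE = "1"; // force the live branch so loadProfileEnv() runs
    process.env.EXAMPLE_API_KEY = "from-shell"; // hypothetical; already set → never overwritten
    delete process.env.OTHER_API_KEY; // hypothetical; unset → eligible to be filled from ~/.profile

    const { cleanup, tempHome } = installTestEnv();

    // Already-populated values survive; only missing keys are sourced from ~/.profile.
    console.log(process.env.EXAMPLE_API_KEY); // "from-shell"
    console.log(process.env.OTHER_API_KEY); // whatever ~/.profile exports, if anything
    console.log(tempHome); // real HOME in live mode (no temp isolation)

    cleanup(); // no-op in live mode

The design choice this illustrates: live suites deliberately reuse the real user environment, so the profile merge is additive only and never clobbers credentials the developer exported explicitly for the run.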