fix: modernize live tests and gemini ids

This commit is contained in:
Peter Steinberger
2026-01-12 06:58:31 +00:00
parent 79cbb20988
commit 1850013cae
11 changed files with 1053 additions and 593 deletions

View File

@@ -0,0 +1,50 @@
// Separators accepted between keys: any run of whitespace, commas, or semicolons.
const KEY_SPLIT_RE = /[\s,;]+/g;

/**
 * Splits a raw delimiter-separated key list into trimmed, non-empty entries.
 * Returns an empty array for null/undefined/empty input.
 */
function parseKeyList(raw?: string | null): string[] {
  if (!raw) return [];
  const keys: string[] = [];
  for (const piece of raw.split(KEY_SPLIT_RE)) {
    const key = piece.trim();
    if (key) keys.push(key);
  }
  return keys;
}
/**
 * Collects trimmed, non-empty values from every environment variable whose
 * name starts with `prefix`, in the iteration order of `process.env`.
 */
function collectEnvPrefixedKeys(prefix: string): string[] {
  const found: string[] = [];
  for (const name of Object.keys(process.env)) {
    if (!name.startsWith(prefix)) continue;
    const candidate = process.env[name]?.trim();
    if (candidate) found.push(candidate);
  }
  return found;
}
/**
 * Resolves the ordered, de-duplicated set of Anthropic API keys for live tests.
 *
 * Priority: a single forced key (CLAWDBOT_LIVE_ANTHROPIC_KEY) wins outright;
 * otherwise the explicit key list, then the canonical ANTHROPIC_API_KEY, then
 * any ANTHROPIC_API_KEY*-prefixed variables. Duplicates keep their first
 * position (Set preserves insertion order).
 */
export function collectAnthropicApiKeys(): string[] {
  const forced = process.env.CLAWDBOT_LIVE_ANTHROPIC_KEY?.trim();
  if (forced) return [forced];
  const ordered = [
    ...parseKeyList(process.env.CLAWDBOT_LIVE_ANTHROPIC_KEYS),
    process.env.ANTHROPIC_API_KEY?.trim() ?? "",
    ...collectEnvPrefixedKeys("ANTHROPIC_API_KEY"),
  ];
  const unique = new Set(ordered.filter((key) => key.length > 0));
  return [...unique];
}
/**
 * Heuristically detects Anthropic rate-limit failures from an error message
 * (case-insensitive match on "rate_limit", "rate limit", or "429").
 */
export function isAnthropicRateLimitError(message: string): boolean {
  const normalized = message.toLowerCase();
  const markers = ["rate_limit", "rate limit", "429"];
  return markers.some((marker) => normalized.includes(marker));
}

View File

@@ -0,0 +1,89 @@
/**
 * Loose provider/model reference; either part may be absent or null for
 * unresolved refs, so consumers must validate both before use.
 */
export type ModelRef = {
  provider?: string | null;
  id?: string | null;
};
// Per-provider model-id prefixes/ids that count as "modern" for live tests.
// NOTE(review): these lists track current provider catalogs and presumably
// need periodic updates as new model generations ship — verify against the
// providers' published model ids.
const ANTHROPIC_PREFIXES = [
  "claude-opus-4-5",
  "claude-sonnet-4-5",
  "claude-haiku-4-5",
];
// Exact-or-prefix matches for direct OpenAI models.
const OPENAI_MODELS = ["gpt-5.2", "gpt-5.0"];
// Exact-or-prefix matches for the OpenAI Codex provider.
const CODEX_MODELS = [
  "gpt-5.2",
  "gpt-5.2-codex",
  "gpt-5.1-codex",
  "gpt-5.1-codex-mini",
  "gpt-5.1-codex-max",
];
const GOOGLE_PREFIXES = ["gemini-3"];
const ZAI_PREFIXES = ["glm-4.7"];
const MINIMAX_PREFIXES = ["minimax-m2.1"];
const XAI_PREFIXES = ["grok-4"];
/** True when `id` starts with any of the given prefixes. */
function matchesPrefix(id: string, prefixes: string[]): boolean {
  return prefixes.some((prefix) => id.startsWith(prefix));
}

/**
 * True when `id` equals or starts with any of the given values.
 * `startsWith` already returns true for an exact match, so no separate
 * equality check is needed.
 */
function matchesExactOrPrefix(id: string, values: string[]): boolean {
  return values.some((value) => id.startsWith(value));
}

/** True when `id` contains any of the given values as a substring. */
function matchesAny(id: string, values: string[]): boolean {
  return values.some((value) => id.includes(value));
}
/**
 * Decides whether a provider/model pair is a "modern" model worth exercising
 * in live tests. Matching is case-insensitive; refs missing either part are
 * never modern. Aggregator providers (openrouter/opencode) match any known
 * model id as a substring since their ids embed upstream ids.
 */
export function isModernModelRef(ref: ModelRef): boolean {
  const provider = ref.provider?.trim().toLowerCase() ?? "";
  const id = ref.id?.trim().toLowerCase() ?? "";
  if (!provider || !id) return false;
  switch (provider) {
    case "anthropic":
      return matchesPrefix(id, ANTHROPIC_PREFIXES);
    case "openai":
      return matchesExactOrPrefix(id, OPENAI_MODELS);
    case "openai-codex":
      return matchesExactOrPrefix(id, CODEX_MODELS);
    case "google":
    case "google-gemini-cli":
      return matchesPrefix(id, GOOGLE_PREFIXES);
    case "google-antigravity":
      // Antigravity can route to either Gemini or Claude models.
      return (
        matchesPrefix(id, GOOGLE_PREFIXES) ||
        matchesPrefix(id, ANTHROPIC_PREFIXES)
      );
    case "zai":
      return matchesPrefix(id, ZAI_PREFIXES);
    case "minimax":
      return matchesPrefix(id, MINIMAX_PREFIXES);
    case "xai":
      return matchesPrefix(id, XAI_PREFIXES);
    case "openrouter":
    case "opencode":
      return matchesAny(id, [
        ...ANTHROPIC_PREFIXES,
        ...OPENAI_MODELS,
        ...CODEX_MODELS,
        ...GOOGLE_PREFIXES,
        ...ZAI_PREFIXES,
        ...MINIMAX_PREFIXES,
        ...XAI_PREFIXES,
      ]);
    default:
      return false;
  }
}

View File

@@ -117,4 +117,59 @@ describe("models config", () => {
);
});
});
// Regression: ensureClawdbotModelsJson must rewrite bare Gemini 3 ids
// ("gemini-3-pro" / "gemini-3-flash") to their "-preview" variants when
// writing models.json for the google provider.
it("normalizes gemini 3 ids to preview for google providers", async () => {
await withTempHome(async () => {
// Reset the module cache so the imports below pick up the temp-home paths.
vi.resetModules();
const { ensureClawdbotModelsJson } = await import("./models-config.js");
const { resolveClawdbotAgentDir } = await import("./agent-paths.js");
// Minimal google provider config using the deprecated bare model ids.
const cfg: ClawdbotConfig = {
models: {
providers: {
google: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta",
apiKey: "GEMINI_KEY",
api: "google-generative-ai",
models: [
{
id: "gemini-3-pro",
name: "Gemini 3 Pro",
api: "google-generative-ai",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 1048576,
maxTokens: 65536,
},
{
id: "gemini-3-flash",
name: "Gemini 3 Flash",
api: "google-generative-ai",
reasoning: false,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 1048576,
maxTokens: 65536,
},
],
},
},
},
};
await ensureClawdbotModelsJson(cfg);
// Read back the generated models.json and check only the rewritten ids.
const modelPath = path.join(resolveClawdbotAgentDir(), "models.json");
const raw = await fs.readFile(modelPath, "utf8");
const parsed = JSON.parse(raw) as {
providers: Record<string, { models: Array<{ id: string }> }>;
};
const ids = parsed.providers.google?.models?.map((model) => model.id);
expect(ids).toEqual([
"gemini-3-pro-preview",
"gemini-3-flash-preview",
]);
});
});
});

View File

@@ -5,6 +5,7 @@ import { type ClawdbotConfig, loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
type ModelsConfig = NonNullable<ClawdbotConfig["models"]>;
type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
@@ -12,6 +13,38 @@ function isRecord(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object" && !Array.isArray(value));
}
/**
 * Maps deprecated bare Gemini 3 ids to their "-preview" variants; any other
 * id passes through unchanged.
 */
function normalizeGoogleModelId(id: string): string {
  switch (id) {
    case "gemini-3-pro":
      return "gemini-3-pro-preview";
    case "gemini-3-flash":
      return "gemini-3-flash-preview";
    default:
      return id;
  }
}
/**
 * Rewrites deprecated bare Gemini 3 model ids inside a google provider
 * config to their "-preview" variants.
 *
 * Returns the original provider object (same reference) when nothing needed
 * rewriting, so callers can detect mutation by identity comparison.
 */
function normalizeGoogleProvider(provider: ProviderConfig): ProviderConfig {
  // Defensive: a provider entry may omit `models` entirely (user-authored
  // config); treat that as "nothing to normalize" instead of crashing on
  // `.map` of undefined.
  if (!Array.isArray(provider.models)) return provider;
  let mutated = false;
  const models = provider.models.map((model) => {
    const nextId = normalizeGoogleModelId(model.id);
    if (nextId === model.id) return model;
    mutated = true;
    // Copy-on-write: only rewritten entries are cloned.
    return { ...model, id: nextId };
  });
  return mutated ? { ...provider, models } : provider;
}
/**
 * Applies provider-specific id normalization (currently only for "google")
 * across the providers map. Returns the input object untouched (same
 * reference) when no provider changed.
 */
function normalizeProviders(
  providers: ModelsConfig["providers"],
): ModelsConfig["providers"] {
  if (!providers) return providers;
  const next: Record<string, ProviderConfig> = {};
  let changed = false;
  for (const [name, provider] of Object.entries(providers)) {
    const normalized =
      name === "google" ? normalizeGoogleProvider(provider) : provider;
    if (normalized !== provider) changed = true;
    next[name] = normalized;
  }
  return changed ? next : providers;
}
async function readJson(pathname: string): Promise<unknown> {
try {
const raw = await fs.readFile(pathname, "utf8");
@@ -53,7 +86,8 @@ export async function ensureClawdbotModelsJson(
}
}
const next = `${JSON.stringify({ providers: mergedProviders }, null, 2)}\n`;
const normalizedProviders = normalizeProviders(mergedProviders);
const next = `${JSON.stringify({ providers: normalizedProviders }, null, 2)}\n`;
try {
existingRaw = await fs.readFile(targetPath, "utf8");
} catch {

View File

@@ -7,24 +7,20 @@ import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import { loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import { getApiKeyForModel } from "./model-auth.js";
import {
buildModelAliasIndex,
parseModelRef,
resolveConfiguredModelRef,
resolveModelRefFromString,
} from "./model-selection.js";
collectAnthropicApiKeys,
isAnthropicRateLimitError,
} from "./live-auth-keys.js";
import { isModernModelRef } from "./live-model-filter.js";
import { getApiKeyForModel } from "./model-auth.js";
import { ensureClawdbotModelsJson } from "./models-config.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_MODELS === "all";
const DIRECT_ENABLED = Boolean(process.env.CLAWDBOT_LIVE_MODELS?.trim());
const REQUIRE_PROFILE_KEYS =
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
const describeLive = LIVE ? describe : describe.skip;
function parseProviderFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
@@ -46,6 +42,10 @@ function parseModelFilter(raw?: string): Set<string> | null {
return ids.length ? new Set(ids) : null;
}
/** Writes a `[live]`-prefixed progress message to stdout. */
function logProgress(message: string): void {
  console.log("[live] " + message);
}
function isGoogleModelNotFoundError(err: unknown): boolean {
const msg = String(err);
if (!/not found/i.test(msg)) return false;
@@ -127,75 +127,25 @@ async function completeOkWithRetry(params: {
return await runOnce();
}
/**
 * Collects the ordered, de-duplicated list of "provider/model" keys the
 * config points at: the resolved default model first, then the agent
 * defaults' primary/fallback model strings (text and image), then the keys
 * of the per-model overrides map.
 */
function resolveConfiguredModelKeys(
cfg: ReturnType<typeof loadConfig>,
): string[] {
const aliasIndex = buildModelAliasIndex({
cfg,
defaultProvider: DEFAULT_PROVIDER,
});
// `order` preserves first-seen position; `seen` provides O(1) dedupe.
const order: string[] = [];
const seen = new Set<string>();
const addKey = (key: string) => {
const normalized = key.trim();
if (!normalized || seen.has(normalized)) return;
seen.add(normalized);
order.push(normalized);
};
const addRef = (ref: { provider: string; model: string }) => {
addKey(`${ref.provider}/${ref.model}`);
};
// The resolved default model always comes first.
addRef(
resolveConfiguredModelRef({
cfg,
defaultProvider: DEFAULT_PROVIDER,
defaultModel: DEFAULT_MODEL,
}),
);
// NOTE(review): these casts assume the config schema stores model settings
// as { primary, fallbacks } — verify against the ClawdbotConfig types.
const modelConfig = cfg.agents?.defaults?.model as
| { primary?: string; fallbacks?: string[] }
| undefined;
const imageModelConfig = cfg.agents?.defaults?.imageModel as
| { primary?: string; fallbacks?: string[] }
| undefined;
const primary = modelConfig?.primary?.trim() ?? "";
const fallbacks = modelConfig?.fallbacks ?? [];
const imagePrimary = imageModelConfig?.primary?.trim() ?? "";
const imageFallbacks = imageModelConfig?.fallbacks ?? [];
// Raw strings may be aliases; resolve them before adding.
const addRaw = (raw: string) => {
const resolved = resolveModelRefFromString({
raw,
defaultProvider: DEFAULT_PROVIDER,
aliasIndex,
});
if (resolved) addRef(resolved.ref);
};
if (primary) addRaw(primary);
for (const raw of fallbacks) addRaw(String(raw ?? ""));
if (imagePrimary) addRaw(imagePrimary);
for (const raw of imageFallbacks) addRaw(String(raw ?? ""));
// Finally include every key of the per-model overrides map.
for (const key of Object.keys(cfg.agents?.defaults?.models ?? {})) {
const parsed = parseModelRef(String(key ?? ""), DEFAULT_PROVIDER);
if (parsed) addRef(parsed);
}
return order;
}
describeLive("live models (profile keys)", () => {
it(
"completes across configured models",
"completes across selected models",
async () => {
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
if (!DIRECT_ENABLED) {
logProgress(
"[live-models] skipping (set CLAWDBOT_LIVE_MODELS=modern|all|<list>; all=modern)",
);
return;
}
const anthropicKeys = collectAnthropicApiKeys();
if (anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
logProgress(
`[live-models] anthropic keys loaded: ${anthropicKeys.length}`,
);
}
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
@@ -205,7 +155,11 @@ describeLive("live models (profile keys)", () => {
models.map((model) => [`${model.provider}/${model.id}`, model]),
);
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
const rawModels = process.env.CLAWDBOT_LIVE_MODELS?.trim();
const useModern = rawModels === "modern" || rawModels === "all";
const useExplicit = Boolean(rawModels) && !useModern;
const filter = useExplicit ? parseModelFilter(rawModels) : null;
const allowNotFoundSkip = useModern;
const providers = parseProviderFilter(
process.env.CLAWDBOT_LIVE_PROVIDERS,
);
@@ -216,149 +170,196 @@ describeLive("live models (profile keys)", () => {
const failures: Array<{ model: string; error: string }> = [];
const skipped: Array<{ model: string; reason: string }> = [];
const candidates: Array<{
model: Model<Api>;
apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
}> = [];
const configuredKeys = resolveConfiguredModelKeys(cfg);
for (const key of configuredKeys) {
const model = modelByKey.get(key);
if (!model) {
skipped.push({
model: key,
reason: "configured model missing in registry",
});
continue;
}
for (const model of models) {
if (providers && !providers.has(model.provider)) continue;
const id = `${model.provider}/${model.id}`;
if (filter && !filter.has(id)) continue;
let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
try {
apiKeyInfo = await getApiKeyForModel({ model, cfg });
} catch (err) {
skipped.push({ model: id, reason: String(err) });
continue;
}
if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
skipped.push({
model: id,
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
try {
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
},
perModelTimeoutMs,
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
perModelTimeoutMs,
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
if (!filter && useModern) {
if (!isModernModelRef({ provider: model.provider, id: model.id })) {
continue;
}
const ok = await completeOkWithRetry({
model,
apiKey: apiKeyInfo.apiKey,
timeoutMs: perModelTimeoutMs,
});
if (ok.res.stopReason === "error") {
const msg = ok.res.errorMessage ?? "";
if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
skipped.push({ model: id, reason: msg });
continue;
}
throw new Error(msg || "model returned error with no message");
}
if (ok.text.length === 0 && model.provider === "google") {
}
try {
const apiKeyInfo = await getApiKeyForModel({ model, cfg });
if (
REQUIRE_PROFILE_KEYS &&
!apiKeyInfo.source.startsWith("profile:")
) {
skipped.push({
model: id,
reason: "no text returned (likely unavailable model id)",
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
expect(ok.text.length).toBeGreaterThan(0);
candidates.push({ model, apiKeyInfo });
} catch (err) {
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
skipped.push({ model: id, reason: String(err) });
continue;
skipped.push({ model: id, reason: String(err) });
}
}
if (candidates.length === 0) {
logProgress("[live-models] no API keys found; skipping");
return;
}
logProgress(
`[live-models] selection=${useExplicit ? "explicit" : "modern"}`,
);
logProgress(`[live-models] running ${candidates.length} models`);
const total = candidates.length;
for (const [index, entry] of candidates.entries()) {
const { model, apiKeyInfo } = entry;
const id = `${model.provider}/${model.id}`;
const progressLabel = `[live-models] ${index + 1}/${total} ${id}`;
const attemptMax =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys.length
: 1;
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
}
const apiKey =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys[attempt]
: apiKeyInfo.apiKey;
try {
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
logProgress(`${progressLabel}: tool-only regression`);
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
},
perModelTimeoutMs,
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
perModelTimeoutMs,
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
logProgress(`${progressLabel}: done`);
break;
}
logProgress(`${progressLabel}: prompt`);
const ok = await completeOkWithRetry({
model,
apiKey,
timeoutMs: perModelTimeoutMs,
});
if (ok.res.stopReason === "error") {
const msg = ok.res.errorMessage ?? "";
if (allowNotFoundSkip && isModelNotFoundErrorMessage(msg)) {
skipped.push({ model: id, reason: msg });
logProgress(`${progressLabel}: skip (model not found)`);
break;
}
throw new Error(msg || "model returned error with no message");
}
if (ok.text.length === 0 && model.provider === "google") {
skipped.push({
model: id,
reason: "no text returned (likely unavailable model id)",
});
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
expect(ok.text.length).toBeGreaterThan(0);
logProgress(`${progressLabel}: done`);
break;
} catch (err) {
const message = String(err);
if (
model.provider === "anthropic" &&
isAnthropicRateLimitError(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: rate limit, retrying with next key`);
continue;
}
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
skipped.push({ model: id, reason: message });
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
logProgress(`${progressLabel}: failed`);
failures.push({ model: id, error: message });
break;
}
failures.push({ model: id, error: String(err) });
}
}
@@ -372,8 +373,6 @@ describeLive("live models (profile keys)", () => {
);
}
// Keep one assertion so the test fails loudly if we somehow ran nothing.
expect(models.length).toBeGreaterThan(0);
void skipped;
},
15 * 60 * 1000,

View File

@@ -11,9 +11,15 @@ import {
} from "@mariozechner/pi-coding-agent";
import { describe, expect, it } from "vitest";
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
import {
collectAnthropicApiKeys,
isAnthropicRateLimitError,
} from "../agents/live-auth-keys.js";
import { isModernModelRef } from "../agents/live-model-filter.js";
import { getApiKeyForModel } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { loadConfig } from "../config/config.js";
import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
import {
GATEWAY_CLIENT_MODES,
GATEWAY_CLIENT_NAMES,
@@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
const EXTRA_IMAGE_PROBES =
process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
const THINKING_LEVEL = "high";
const THINKING_TAG_RE =
/<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
function parseFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
@@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set<string> | null {
return ids.length ? new Set(ids) : null;
}
/** Prints a `[live]`-prefixed progress line for gateway live-test output. */
function logProgress(message: string): void {
  const line = `[live] ${message}`;
  console.log(line);
}
/**
 * Throws when a model response leaks internal reasoning markup (thinking or
 * final tags) into visible text. Empty text passes trivially. The thrown
 * message embeds at most the first 200 characters of the offending text.
 */
function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
}): void {
  const { text, model, phase, label } = params;
  if (!text) return;
  const leaked = THINKING_TAG_RE.test(text) || FINAL_TAG_RE.test(text);
  if (!leaked) return;
  const snippet = text.length > 200 ? `${text.slice(0, 200)}` : text;
  throw new Error(
    `[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`,
  );
}
function extractPayloadText(result: unknown): string {
const record = result as Record<string, unknown>;
const payloads = Array.isArray(record.payloads) ? record.payloads : [];
@@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) {
});
}
/**
 * Inputs for one run of the shared live-gateway model suite.
 * `label` tags log lines and the session key; `candidates` are the models to
 * exercise; the probe flags enable optional exec/image checks; and
 * `providerOverrides` lets callers swap provider configs (e.g. minimax API
 * variants) before the gateway boots.
 */
type GatewayModelSuiteParams = {
  label: string;
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  extraToolProbes: boolean;
  extraImageProbes: boolean;
  thinkingLevel: string;
  providerOverrides?: Record<string, ModelProviderConfig>;
};
/**
 * Derives the config used by the live gateway run from the user's config:
 * disables sandboxing on all agents (probes touch host files), restricts the
 * default models map to the candidate models, forces lmstudio onto the
 * OpenAI-completions API, and layers in any provider overrides.
 */
function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
}): ClawdbotConfig {
  const { cfg, candidates, providerOverrides } = params;
  const baseProviders = cfg.models?.providers ?? {};
  const lmstudio = cfg.models?.providers?.lmstudio;
  // Overrides win over the lmstudio patch, which wins over the base map.
  const nextProviders = {
    ...baseProviders,
    ...(lmstudio
      ? { lmstudio: { ...lmstudio, api: "openai-completions" } }
      : {}),
    ...(providerOverrides ?? {}),
  };
  const providers =
    Object.keys(nextProviders).length > 0 ? nextProviders : baseProviders;
  // Empty override objects: presence in the map is what matters downstream.
  const candidateModels = Object.fromEntries(
    candidates.map((model) => [`${model.provider}/${model.id}`, {}]),
  );
  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: (cfg.agents?.list ?? []).map((entry) => ({
        ...entry,
        sandbox: { mode: "off" },
      })),
      defaults: {
        ...cfg.agents?.defaults,
        // Live tests should avoid Docker sandboxing so tool probes can
        // operate on the temporary probe files we create in the host workspace.
        sandbox: { mode: "off" },
        models: candidateModels,
      },
    },
    models:
      Object.keys(providers).length > 0
        ? { ...cfg.models, providers }
        : cfg.models,
  };
}
/**
 * Clones the configured minimax provider with a different API flavor and
 * base URL. Returns null when no minimax provider with at least one model is
 * configured, so callers can skip the variant.
 */
function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
}): ModelProviderConfig | null {
  const existing = params.cfg.models?.providers?.minimax;
  if (!existing) return null;
  const models = existing.models;
  if (!Array.isArray(models) || models.length === 0) return null;
  return {
    ...existing,
    api: params.api,
    baseUrl: params.baseUrl,
  };
}
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
};
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const workspaceDir = resolveUserPath(
params.cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
);
await fs.mkdir(workspaceDir, { recursive: true });
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(
workspaceDir,
`.clawdbot-live-tool-probe.${nonceA}.txt`,
);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const nextCfg = buildLiveGatewayConfig({
cfg: params.cfg,
candidates: params.candidates,
providerOverrides: params.providerOverrides,
});
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
const tempConfigPath = path.join(tempDir, "clawdbot.json");
await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
await ensureClawdbotModelsJson(nextCfg);
const port = await getFreeGatewayPort();
const server = await startGatewayServer(port, {
bind: "loopback",
auth: { mode: "token", token },
controlUiEnabled: false,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
});
try {
logProgress(
`[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
);
const anthropicKeys = collectAnthropicApiKeys();
if (anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
}
const sessionKey = `agent:dev:${params.label}`;
const failures: Array<{ model: string; error: string }> = [];
const total = params.candidates.length;
for (const [index, model] of params.candidates.entries()) {
const modelKey = `${model.provider}/${model.id}`;
const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;
const attemptMax =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys.length
: 1;
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
}
try {
// Ensure session exists + override model for this run.
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: modelKey,
});
// Reset between models: avoids cross-provider transcript incompatibilities
// (notably OpenAI Responses requiring reasoning replay for function_call items).
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
logProgress(`${progressLabel}: prompt`);
const runId = randomUUID();
const payload = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
const text = extractPayloadText(payload?.result);
if (model.provider === "google" && isGoogleModelNotFoundText(text)) {
// Catalog drift: model IDs can disappear or become unavailable on the API.
// Treat as skip when scanning "all models" for Google.
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
assertNoReasoningTags({
text,
model: modelKey,
phase: "prompt",
label: params.label,
});
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
if (
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
) {
throw new Error(`missing required keywords: ${text}`);
}
// Real tool invocation: force the agent to Read a local file and echo a nonce.
logProgress(`${progressLabel}: tool-read`);
const runIdTool = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool`,
message:
"Clawdbot live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
assertNoReasoningTags({
text: toolText,
model: modelKey,
phase: "tool-read",
label: params.label,
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`tool probe missing nonce: ${toolText}`);
}
if (params.extraToolProbes) {
logProgress(`${progressLabel}: tool-exec`);
const nonceC = randomUUID();
const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"Clawdbot live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
);
}
const execReadText = extractPayloadText(execReadProbe?.result);
assertNoReasoningTags({
text: execReadText,
model: modelKey,
phase: "tool-exec",
label: params.label,
});
if (!execReadText.includes(nonceC)) {
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
}
await fs.rm(toolWritePath, { force: true });
}
if (params.extraImageProbes && model.input?.includes("image")) {
logProgress(`${progressLabel}: image`);
const imageCode = randomImageProbeCode(10);
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (imageProbe?.status !== "ok") {
throw new Error(
`image probe failed: status=${String(imageProbe?.status)}`,
);
}
const imageText = extractPayloadText(imageProbe?.result);
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
throw new Error(`image probe missing 'cat': ${imageText}`);
}
const candidates =
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) return best;
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
if (!(bestDistance <= 2)) {
throw new Error(
`image probe missing code (${imageCode}): ${imageText}`,
);
}
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" && model.api === "openai-responses") ||
(model.provider === "openai-codex" &&
model.api === "openai-codex-responses")
) {
logProgress(`${progressLabel}: tool-only regression`);
const runId2 = randomUUID();
const first = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (first?.status !== "ok") {
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
}
const firstText = extractPayloadText(first?.result);
assertNoReasoningTags({
text: firstText,
model: modelKey,
phase: "tool-only",
label: params.label,
});
const second = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (second?.status !== "ok") {
throw new Error(
`post-tool message failed: status=${String(second?.status)}`,
);
}
const reply = extractPayloadText(second?.result);
assertNoReasoningTags({
text: reply,
model: modelKey,
phase: "tool-only-followup",
label: params.label,
});
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
throw new Error(`unexpected reply: ${reply}`);
}
}
logProgress(`${progressLabel}: done`);
break;
} catch (err) {
const message = String(err);
if (
model.provider === "anthropic" &&
isAnthropicRateLimitError(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: rate limit, retrying with next key`);
continue;
}
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
if (
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
break;
}
logProgress(`${progressLabel}: failed`);
failures.push({ model: modelKey, error: message });
break;
}
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 20)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`gateway live model failures (${failures.length}):\n${preview}`,
);
}
} finally {
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
await fs.rm(tempDir, { recursive: true, force: true });
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
}
}
describeLive("gateway live (dev agent, profile keys)", () => {
it(
"runs meaningful prompts across models with available keys",
async () => {
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
};
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const workspaceDir = resolveUserPath(
cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
);
await fs.mkdir(workspaceDir, { recursive: true });
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(
workspaceDir,
`.clawdbot-live-tool-probe.${nonceA}.txt`,
);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const all = modelRegistry.getAll() as Array<Model<Api>>;
const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS);
// Default: honor user allowlist. Opt-in: scan all models with keys.
const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {});
const wanted =
ALL_MODELS || allowlistKeys.length === 0
? all
: all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`));
const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
const useModern =
!rawModels || rawModels === "modern" || rawModels === "all";
const useExplicit = Boolean(rawModels) && !useModern;
const filter = useExplicit ? parseFilter(rawModels) : null;
const wanted = filter
? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
: all.filter((m) =>
isModernModelRef({ provider: m.provider, id: m.id }),
);
const candidates: Array<Model<Api>> = [];
for (const model of wanted) {
const id = `${model.provider}/${model.id}`;
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
if (filter && !filter.has(id)) continue;
try {
// eslint-disable-next-line no-await-in-loop
await getApiKeyForModel({ model, cfg });
@@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => {
}
}
expect(candidates.length).toBeGreaterThan(0);
const imageCandidates = EXTRA_IMAGE_PROBES
? candidates.filter((m) => m.input?.includes("image"))
: [];
if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) {
throw new Error(
"image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
if (candidates.length === 0) {
logProgress("[all-models] no API keys found; skipping");
return;
}
logProgress(
`[all-models] selection=${useExplicit ? "explicit" : "modern"}`,
);
const imageCandidates = candidates.filter((m) =>
m.input?.includes("image"),
);
if (imageCandidates.length === 0) {
logProgress(
"[all-models] no image-capable models selected; image probe will be skipped",
);
}
// Build a temp config that allows all selected models, so session overrides stick.
const lmstudioProvider = cfg.models?.providers?.lmstudio;
const nextCfg = {
...cfg,
agents: {
...cfg.agents,
list: (cfg.agents?.list ?? []).map((entry) => ({
...entry,
sandbox: { mode: "off" },
})),
defaults: {
...cfg.agents?.defaults,
// Live tests should avoid Docker sandboxing so tool probes can
// operate on the temporary probe files we create in the host workspace.
sandbox: { mode: "off" },
models: Object.fromEntries(
candidates.map((m) => [`${m.provider}/${m.id}`, {}]),
),
},
},
models: {
...cfg.models,
providers: {
...cfg.models?.providers,
// LM Studio is most reliable via Chat Completions; its Responses API
// tool-calling behavior is inconsistent across releases.
...(lmstudioProvider
? {
lmstudio: {
...lmstudioProvider,
api: "openai-completions",
},
}
: {}),
},
},
};
const tempDir = await fs.mkdtemp(
path.join(os.tmpdir(), "clawdbot-live-"),
);
const tempConfigPath = path.join(tempDir, "clawdbot.json");
await fs.writeFile(
tempConfigPath,
`${JSON.stringify(nextCfg, null, 2)}\n`,
);
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
const port = await getFreeGatewayPort();
const server = await startGatewayServer(port, {
bind: "loopback",
auth: { mode: "token", token },
controlUiEnabled: false,
await runGatewayModelSuite({
label: "all-models",
cfg,
candidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
if (minimaxCandidates.length === 0) {
logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
return;
}
const minimaxOpenAi = buildMinimaxProviderOverride({
cfg,
api: "openai-completions",
baseUrl: "https://api.minimax.io/v1",
});
if (minimaxOpenAi) {
await runGatewayModelSuite({
label: "minimax-openai",
cfg,
candidates: minimaxCandidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
providerOverrides: { minimax: minimaxOpenAi },
});
} else {
logProgress("[minimax-openai] missing minimax provider config; skipping");
}
try {
const sessionKey = "agent:dev:live-gateway";
const failures: Array<{ model: string; error: string }> = [];
for (const model of candidates) {
const modelKey = `${model.provider}/${model.id}`;
try {
// Ensure session exists + override model for this run.
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: modelKey,
});
// Reset between models: avoids cross-provider transcript incompatibilities
// (notably OpenAI Responses requiring reasoning replay for function_call items).
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
// “Meaningful” direct prompt (no tools).
const runId = randomUUID();
const payload = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
deliver: false,
},
{ expectFinal: true },
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
const text = extractPayloadText(payload?.result);
if (
model.provider === "google" &&
isGoogleModelNotFoundText(text)
) {
// Catalog drift: model IDs can disappear or become unavailable on the API.
// Treat as skip when scanning "all models" for Google.
continue;
}
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
if (
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
) {
throw new Error(`missing required keywords: ${text}`);
}
// Real tool invocation: force the agent to Read a local file and echo a nonce.
const runIdTool = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool`,
message:
"Clawdbot live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(
`tool probe failed: status=${String(toolProbe?.status)}`,
);
}
const toolText = extractPayloadText(toolProbe?.result);
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`tool probe missing nonce: ${toolText}`);
}
if (EXTRA_TOOL_PROBES) {
const nonceC = randomUUID();
const toolWritePath = path.join(
tempDir,
`write-${runIdTool}.txt`,
);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"Clawdbot live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
);
}
const execReadText = extractPayloadText(execReadProbe?.result);
if (!execReadText.includes(nonceC)) {
throw new Error(
`exec+read probe missing nonce: ${execReadText}`,
);
}
await fs.rm(toolWritePath, { force: true });
}
if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
const imageCode = randomImageProbeCode(10);
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
deliver: false,
},
{ expectFinal: true },
);
if (imageProbe?.status !== "ok") {
throw new Error(
`image probe failed: status=${String(imageProbe?.status)}`,
);
}
const imageText = extractPayloadText(imageProbe?.result);
if (!/\bcat\b/i.test(imageText)) {
throw new Error(`image probe missing 'cat': ${imageText}`);
}
const candidates =
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) return best;
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
if (!(bestDistance <= 2)) {
throw new Error(
`image probe missing code (${imageCode}): ${imageText}`,
);
}
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" &&
model.api === "openai-responses") ||
(model.provider === "openai-codex" &&
model.api === "openai-codex-responses")
) {
const runId2 = randomUUID();
const first = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
deliver: false,
},
{ expectFinal: true },
);
if (first?.status !== "ok") {
throw new Error(
`tool-only turn failed: status=${String(first?.status)}`,
);
}
const second = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
deliver: false,
},
{ expectFinal: true },
);
if (second?.status !== "ok") {
throw new Error(
`post-tool message failed: status=${String(second?.status)}`,
);
}
const reply = extractPayloadText(second?.result);
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
throw new Error(`unexpected reply: ${reply}`);
}
}
} catch (err) {
const message = String(err);
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
if (
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
continue;
}
failures.push({ model: modelKey, error: message });
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 20)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`gateway live model failures (${failures.length}):\n${preview}`,
);
}
} finally {
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
await fs.rm(tempDir, { recursive: true, force: true });
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
const minimaxAnthropic = buildMinimaxProviderOverride({
cfg,
api: "anthropic-messages",
baseUrl: "https://api.minimax.io/anthropic",
});
if (minimaxAnthropic) {
await runGatewayModelSuite({
label: "minimax-anthropic",
cfg,
candidates: minimaxCandidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
providerOverrides: { minimax: minimaxAnthropic },
});
} else {
logProgress("[minimax-anthropic] missing minimax provider config; skipping");
}
},
20 * 60 * 1000,
@@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
@@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
);
}
const toolText = extractPayloadText(toolProbe?.result);
assertNoReasoningTags({
text: toolText,
model: "anthropic/claude-opus-4-5",
phase: "zai-fallback-tool",
label: "zai-fallback",
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
}
@@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
@@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
);
}
const followupText = extractPayloadText(followup?.result);
assertNoReasoningTags({
text: followupText,
model: "zai/glm-4.7",
phase: "zai-fallback-followup",
label: "zai-fallback",
});
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
throw new Error(`zai followup missing nonce: ${followupText}`);
}