fix(testing): stabilize live model runs

2026-01-11 04:21:24 +00:00
parent 3b6739d3e9
commit f00038b383
4 changed files with 169 additions and 47 deletions
--- a/patches/@mariozechner__pi-ai@0.42.2.patch
+++ b/patches/@mariozechner__pi-ai@0.42.2.patch
@@ -27,19 +27,3 @@ index 188a8294f26fe1bfe3fb298a7f58e4d8eaf2a529..a3aeb6a7ff53bc4f7f44362adb950b2c
     }));
 }
 function mapStopReason(status) {
-diff --git a/dist/providers/openai-responses.js b/dist/providers/openai-responses.js
-index 7b58a79c989abc76bb8fc9e99fb49126e5fd7de4..a1a7f35ad47975dc1268d1a0c2078b0b651e97b4 100644
--- a/dist/providers/openai-responses.js
-+++ b/dist/providers/openai-responses.js
-@@ -396,9 +396,10 @@ function convertMessages(model, context) {
-         }
-         else if (msg.role === "assistant") {
-             const output = [];
-+            const hasAssistantText = msg.content.some((block) => block.type === "text");
-             for (const block of msg.content) {
-                 // Do not submit thinking blocks if the completion had an error (i.e. abort)
-                if (block.type === "thinking" && msg.stopReason !== "error") {
-+                if (block.type === "thinking" && msg.stopReason !== "error" && hasAssistantText) {
-                     if (block.thinkingSignature) {
-                         const reasoningItem = JSON.parse(block.thinkingSignature);
-                         output.push(reasoningItem);
--- a/src/agents/models.profiles.live.test.ts
+++ b/src/agents/models.profiles.live.test.ts
@@ -7,7 +7,14 @@ import { Type } from "@sinclair/typebox";
 import { describe, expect, it } from "vitest";
 import { loadConfig } from "../config/config.js";
 import { resolveClawdbotAgentDir } from "./agent-paths.js";
+import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import { getApiKeyForModel } from "./model-auth.js";
+import {
+  buildModelAliasIndex,
+  parseModelRef,
+  resolveConfiguredModelRef,
+  resolveModelRefFromString,
+} from "./model-selection.js";
 import { ensureClawdbotModelsJson } from "./models-config.js";

 const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
@@ -58,6 +65,131 @@ function isModelNotFoundErrorMessage(raw: string): boolean {
  return false;
 }

+function toInt(value: string | undefined, fallback: number): number { // Parses `value` as a base-10 integer; returns `fallback` when it is missing, blank, or not numeric.
+  const trimmed = value?.trim();
+  if (!trimmed) return fallback; // undefined, empty, or whitespace-only input
+  const parsed = Number.parseInt(trimmed, 10); // NOTE: parseInt accepts a numeric prefix, so "30s" parses as 30
+  return Number.isFinite(parsed) ? parsed : fallback; // NaN (no leading digits) falls back
+}
+
+async function completeSimpleWithTimeout<TApi extends Api>( // Calls `completeSimple` with an AbortSignal that is aborted after `timeoutMs`.
+  model: Model<TApi>,
+  context: Parameters<typeof completeSimple<TApi>>[1],
+  options: Parameters<typeof completeSimple<TApi>>[2],
+  timeoutMs: number, // timer delay; clamped to at least 1 below
+) {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
+  timer.unref?.(); // where the runtime supports it, keep the pending timer from holding the process open
+  try {
+    return await completeSimple(model, context, {
+      ...options,
+      signal: controller.signal, // listed after the spread, so it replaces any `signal` present in `options`
+    });
+  } finally {
+    clearTimeout(timer); // always cancel the timer, whether the call resolved or threw
+  }
+}
+
+async function completeOkWithRetry(params: { // Sends a fixed "Reply with the word ok." prompt; if the reply has no text, tries exactly once more.
+  model: Model<Api>;
+  apiKey: string;
+  timeoutMs: number; // passed to completeSimpleWithTimeout on each attempt
+}) {
+  const runOnce = async () => { // one attempt: resolves to the raw response plus its joined text
+    const res = await completeSimpleWithTimeout(
+      params.model,
+      {
+        messages: [
+          {
+            role: "user",
+            content: "Reply with the word ok.",
+            timestamp: Date.now(),
+          },
+        ],
+      },
+      {
+        apiKey: params.apiKey,
+        reasoning: params.model.reasoning ? "low" : undefined, // request "low" reasoning only when `model.reasoning` is truthy
+        maxTokens: 64,
+      },
+      params.timeoutMs,
+    );
+    const text = res.content
+      .filter((block) => block.type === "text")
+      .map((block) => block.text.trim())
+      .join(" "); // trimmed text blocks joined by single spaces; "" when there are no text blocks
+    return { res, text };
+  };
+
+  const first = await runOnce();
+  if (first.text.length > 0) return first;
+  return await runOnce(); // single retry; its result is returned as-is, even if its text is also empty
+}
+
+function resolveConfiguredModelKeys( // Returns de-duplicated "provider/model" keys for the models referenced by `cfg`, in first-seen order.
+  cfg: ReturnType<typeof loadConfig>,
+): string[] {
+  const aliasIndex = buildModelAliasIndex({
+    cfg,
+    defaultProvider: DEFAULT_PROVIDER,
+  });
+  const order: string[] = []; // result list; preserves insertion order
+  const seen = new Set<string>(); // keys already present in `order`
+
+  const addKey = (key: string) => { // trims the key and skips empty or already-seen values
+    const normalized = key.trim();
+    if (!normalized || seen.has(normalized)) return;
+    seen.add(normalized);
+    order.push(normalized);
+  };
+
+  const addRef = (ref: { provider: string; model: string }) => {
+    addKey(`${ref.provider}/${ref.model}`);
+  };
+
+  addRef( // 1) the resolved configured/default model ref is added first
+    resolveConfiguredModelRef({
+      cfg,
+      defaultProvider: DEFAULT_PROVIDER,
+      defaultModel: DEFAULT_MODEL,
+    }),
+  );
+
+  const modelConfig = cfg.agents?.defaults?.model as
+    | { primary?: string; fallbacks?: string[] }
+    | undefined;
+  const imageModelConfig = cfg.agents?.defaults?.imageModel as
+    | { primary?: string; fallbacks?: string[] }
+    | undefined;
+
+  const primary = modelConfig?.primary?.trim() ?? "";
+  const fallbacks = modelConfig?.fallbacks ?? [];
+  const imagePrimary = imageModelConfig?.primary?.trim() ?? "";
+  const imageFallbacks = imageModelConfig?.fallbacks ?? [];
+
+  const addRaw = (raw: string) => { // resolves a raw config string via the alias index; a falsy result is skipped
+    const resolved = resolveModelRefFromString({
+      raw,
+      defaultProvider: DEFAULT_PROVIDER,
+      aliasIndex,
+    });
+    if (resolved) addRef(resolved.ref);
+  };
+
+  if (primary) addRaw(primary); // 2) agents.defaults.model: primary, then fallbacks
+  for (const raw of fallbacks) addRaw(String(raw ?? ""));
+  if (imagePrimary) addRaw(imagePrimary); // 3) agents.defaults.imageModel: primary, then fallbacks
+  for (const raw of imageFallbacks) addRaw(String(raw ?? ""));
+
+  for (const key of Object.keys(cfg.agents?.defaults?.models ?? {})) { // 4) every key of agents.defaults.models
+    const parsed = parseModelRef(String(key ?? ""), DEFAULT_PROVIDER);
+    if (parsed) addRef(parsed);
+  }
+
+  return order;
+}
+
 describeLive("live models (profile keys)", () => {
  it(
    "completes across configured models",
@@ -69,16 +201,33 @@ describeLive("live models (profile keys)", () => {
      const authStorage = discoverAuthStorage(agentDir);
      const modelRegistry = discoverModels(authStorage, agentDir);
      const models = modelRegistry.getAll() as Array<Model<Api>>;
+      const modelByKey = new Map(
+        models.map((model) => [`${model.provider}/${model.id}`, model]),
+      );

      const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
      const providers = parseProviderFilter(
        process.env.CLAWDBOT_LIVE_PROVIDERS,
      );
+      const perModelTimeoutMs = toInt(
+        process.env.CLAWDBOT_LIVE_MODEL_TIMEOUT_MS,
+        30_000,
+      );

      const failures: Array<{ model: string; error: string }> = [];
      const skipped: Array<{ model: string; reason: string }> = [];

-      for (const model of models) {
+      const configuredKeys = resolveConfiguredModelKeys(cfg);
+
+      for (const key of configuredKeys) {
+        const model = modelByKey.get(key);
+        if (!model) {
+          skipped.push({
+            model: key,
+            reason: "configured model missing in registry",
+          });
+          continue;
+        }
        if (providers && !providers.has(model.provider)) continue;
        const id = `${model.provider}/${model.id}`;
        if (filter && !filter.has(id)) continue;
@@ -100,7 +249,7 @@ describeLive("live models (profile keys)", () => {
        }

        try {
-          // Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
+          // Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
          if (
            model.provider === "openai" &&
            model.api === "openai-responses" &&
@@ -112,7 +261,7 @@ describeLive("live models (profile keys)", () => {
              parameters: Type.Object({}, { additionalProperties: false }),
            };

-            const first = await completeSimple(
+            const first = await completeSimpleWithTimeout(
              model,
              {
                messages: [
@@ -130,6 +279,7 @@ describeLive("live models (profile keys)", () => {
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 128,
              },
+              perModelTimeoutMs,
            );

            const toolCall = first.content.find((b) => b.type === "toolCall");
@@ -138,7 +288,7 @@ describeLive("live models (profile keys)", () => {
              throw new Error("expected tool call");
            }

-            const second = await completeSimple(
+            const second = await completeSimpleWithTimeout(
              model,
              {
                messages: [
@@ -169,6 +319,7 @@ describeLive("live models (profile keys)", () => {
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 64,
              },
+              perModelTimeoutMs,
            );

            const secondText = second.content
@@ -179,26 +330,14 @@ describeLive("live models (profile keys)", () => {
            continue;
          }

-          const res = await completeSimple(
+          const ok = await completeOkWithRetry({
            model,
-            {
-              messages: [
-                {
-                  role: "user",
-                  content: "Reply with the word ok.",
-                  timestamp: Date.now(),
-                },
-              ],
-            },
-            {
-              apiKey: apiKeyInfo.apiKey,
-              reasoning: model.reasoning ? "low" : undefined,
-              maxTokens: 64,
-            },
-          );
+            apiKey: apiKeyInfo.apiKey,
+            timeoutMs: perModelTimeoutMs,
+          });

-          if (res.stopReason === "error") {
-            const msg = res.errorMessage ?? "";
+          if (ok.res.stopReason === "error") {
+            const msg = ok.res.errorMessage ?? "";
            if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
              skipped.push({ model: id, reason: msg });
              continue;
@@ -206,18 +345,14 @@ describeLive("live models (profile keys)", () => {
            throw new Error(msg || "model returned error with no message");
          }

-          const text = res.content
-            .filter((block) => block.type === "text")
-            .map((block) => block.text.trim())
-            .join(" ");
-          if (text.length === 0 && model.provider === "google") {
+          if (ok.text.length === 0 && model.provider === "google") {
            skipped.push({
              model: id,
              reason: "no text returned (likely unavailable model id)",
            });
            continue;
          }
-          expect(text.length).toBeGreaterThan(0);
+          expect(ok.text.length).toBeGreaterThan(0);
        } catch (err) {
          if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
            skipped.push({ model: id, reason: String(err) });
--- a/src/agents/openai-responses.reasoning-replay.test.ts
+++ b/src/agents/openai-responses.reasoning-replay.test.ts
@@ -52,7 +52,7 @@ function installFailingFetchCapture() {
 }

 describe("openai-responses reasoning replay", () => {
-  it("skips reasoning for tool-call-only turns (OpenAI rejects standalone reasoning)", async () => {
+  it("replays reasoning for tool-call-only turns (OpenAI requires it)", async () => {
    const cap = installFailingFetchCapture();
    try {
      const model = buildModel();
@@ -141,8 +141,11 @@ describe("openai-responses reasoning replay", () => {
        )
        .filter((t): t is string => typeof t === "string");

+      expect(types).toContain("reasoning");
      expect(types).toContain("function_call");
-      expect(types).not.toContain("reasoning");
+      expect(types.indexOf("reasoning")).toBeLessThan(
+        types.indexOf("function_call"),
+      );
    } finally {
      cap.restore();
    }
--- a/src/commands/doctor-sandbox.ts
+++ b/src/commands/doctor-sandbox.ts
@@ -290,7 +290,7 @@ export function noteSandboxScopeWarnings(cfg: ClawdbotConfig) {
    warnings.push(
      `- agents.list (id "${agentId}") sandbox ${overrides.join(
        "/",
-      )} overrides ignored (scope resolves to "shared").`,
+      )} overrides ignored\n  scope resolves to "shared".`,
    );
  }