feat(macos): add Canvas A2UI renderer

2025-12-17 11:35:06 +01:00
parent 1cdebb68a0
commit cdb5ddb2da
408 changed files with 73598 additions and 32 deletions
--- a/vendor/a2ui/specification/0.9/eval/src/ai.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/ai.ts
@@ -0,0 +1,46 @@
+
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { googleAI } from "@genkit-ai/google-genai";
+import { genkit } from "genkit";
+import { openAI } from "@genkit-ai/compat-oai/openai";
+import { anthropic } from "genkitx-anthropic";
+import { logger } from "./logger";
+
+// Provider plugins are registered only when the matching API key is present
+// in the environment, so the tool can run with any subset of providers.
+const plugins = [];
+
+const geminiApiKey = process.env.GEMINI_API_KEY;
+if (geminiApiKey) {
+  logger.info("Initializing Google AI plugin...");
+  const googlePlugin = googleAI({
+    apiKey: geminiApiKey,
+    experimental_debugTraces: true,
+  });
+  plugins.push(googlePlugin);
+}
+
+// The OpenAI plugin is constructed without arguments; only the presence of
+// the key is checked here.
+if (process.env.OPENAI_API_KEY) {
+  logger.info("Initializing OpenAI plugin...");
+  plugins.push(openAI());
+}
+
+const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
+if (anthropicApiKey) {
+  logger.info("Initializing Anthropic plugin...");
+  const anthropicPlugin = anthropic({ apiKey: anthropicApiKey });
+  plugins.push(anthropicPlugin);
+}
+
+/** Genkit instance configured with whichever plugins were registered above. */
+export const ai = genkit({ plugins });
--- a/vendor/a2ui/specification/0.9/eval/src/analysis_flow.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/analysis_flow.ts
@@ -0,0 +1,118 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { z } from "genkit";
+import { ai } from "./ai";
+import { rateLimiter } from "./rateLimiter";
+import { logger } from "./logger";
+
+/**
+ * Flow that asks the evaluation model for a short Markdown summary of the
+ * failures recorded for one tested model.
+ *
+ * Input: the tested model's name, its failures, the total number of runs
+ * (used to derive the successful-run count) and the name of the model that
+ * performs the analysis.
+ *
+ * Output: the model's text response. Errors raised while generating are not
+ * thrown; they are logged, reported to the rate limiter and returned as a
+ * string starting with "Analysis failed:".
+ */
+export const analysisFlow = ai.defineFlow(
+  {
+    name: "analysisFlow",
+    inputSchema: z.object({
+      modelName: z.string(),
+      failures: z.array(
+        z.object({
+          promptName: z.string(),
+          runNumber: z.number(),
+          failureType: z.string(),
+          reason: z.string(),
+          issues: z.array(z.string()).optional(),
+        })
+      ),
+      numRuns: z.number(),
+      evalModel: z.string(),
+    }),
+    outputSchema: z.string(),
+  },
+  async ({ modelName, failures, numRuns, evalModel }) => {
+    // One text section per failure, separated by a horizontal rule.
+    const failureDetails = failures
+      .map((f) => {
+        let details = `Prompt: ${f.promptName} (Run ${f.runNumber})\nType: ${f.failureType}\nReason: ${f.reason}`;
+        if (f.issues && f.issues.length > 0) {
+          details += `\nIssues:\n- ${f.issues.join("\n- ")}`;
+        }
+        return details;
+      })
+      .join("\n\n---\n\n");
+
+    // Number of failures carrying the given failureType label.
+    const countByType = (failureType: string): number =>
+      failures.filter((f) => f.failureType === failureType).length;
+
+    const analysisPrompt = `You are an expert AI analyst.
+Your task is to analyze the following failures from an evaluation run of the model "${modelName}".
+
+Out of the ${failures.length} failures, ${countByType("Schema Validation")} are schema validation failures, ${countByType("Missing Components")} are missing components failures, and ${countByType("Incorrect Logic")} are incorrect logic failures.
+
+There were ${numRuns - failures.length} successful runs. Take this into account in the final summary of the analysis.
+
+Failures:
+${failureDetails}
+
+Instructions:
+1. Identify and list the broad types of errors (e.g., Schema Validation, Missing Components, Incorrect Logic, etc.).
+2. Analyze succinctly any patterns you see in the failures (e.g., "The model consistently fails to include the 'id' property", "The model struggles with nested layouts") and list them in a bullet point list. Try to give short examples of the patterns taken from the actual failures.
+3. Provide a concise summary of your findings in a single paragraph.
+
+The output is meant to be a short summary, not a full report. It should be easy to read and understand at a glance.
+
+Output Format:
+Return a Markdown formatted summary. Use headers and bullet points.
+`;
+
+    // Rough token estimate for rate limiting: one token per 2.5 characters.
+    const estimatedInputTokens = Math.ceil(analysisPrompt.length / 2.5);
+
+    const { modelsToTest } = await import("./models");
+    let evalModelConfig = modelsToTest.find((m) => m.name === evalModel);
+
+    if (!evalModelConfig) {
+      // Unknown model name: fall back to default limits and let generate()
+      // resolve the model from its name.
+      evalModelConfig = {
+        name: evalModel,
+        model: null,
+        requestsPerMinute: 60,
+        tokensPerMinute: 100000,
+      };
+    }
+
+    await rateLimiter.acquirePermit(evalModelConfig, estimatedInputTokens);
+
+    try {
+      const response = await ai.generate({
+        prompt: analysisPrompt,
+        model: evalModelConfig.model || evalModel,
+        config: evalModelConfig.config,
+        output: {
+          format: "text",
+        },
+      });
+
+      const output = response.output;
+      if (!output) {
+        throw new Error("No output from analysis model");
+      }
+
+      if (typeof output !== "string") {
+        return "Analysis failed: Output was not a string.";
+      }
+
+      return output;
+    } catch (e: any) {
+      logger.error(`Error during analysis: ${e}`);
+      if (evalModelConfig) {
+        rateLimiter.reportError(evalModelConfig, e);
+      }
+      // Thrown values without a string `message` (strings, plain objects)
+      // would otherwise produce "Analysis failed: undefined".
+      const message = typeof e?.message === "string" ? e.message : String(e);
+      return `Analysis failed: ${message}`;
+    }
+  }
+);
--- a/vendor/a2ui/specification/0.9/eval/src/dev.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/dev.ts
@@ -0,0 +1,18 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import "./generation_flow";
+import "./evaluation_flow";
--- a/vendor/a2ui/specification/0.9/eval/src/evaluation_flow.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/evaluation_flow.ts
@@ -0,0 +1,193 @@
+
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { z } from "genkit";
+import { ai } from "./ai";
+import { rateLimiter } from "./rateLimiter";
+import { logger } from "./logger";
+import * as yaml from "js-yaml";
+
+// LLM-as-judge flow: given the user's request, the generated output and the
+// schemas, asks the evaluation model for a pass/fail verdict with issues.
+export const evaluationFlow = ai.defineFlow(
+  {
+    name: "evaluationFlow",
+    inputSchema: z.object({
+      originalPrompt: z.string(),
+      generatedOutput: z.string(),
+      evalModel: z.string(),
+      schemas: z.any(),
+    }),
+    outputSchema: z.object({
+      pass: z.boolean(),
+      reason: z.string(),
+      issues: z
+        .array(
+          z.object({
+            issue: z.string(),
+            severity: z.enum(["minor", "significant", "critical"]),
+          })
+        )
+        .optional(),
+      evalPrompt: z.string().optional(),
+    }),
+  },
+  async ({ originalPrompt, generatedOutput, evalModel, schemas }) => {
+    // Every schema is pretty-printed and embedded in the prompt.
+    const schemaText = Object.values(schemas)
+      .map((schema: any) => JSON.stringify(schema, null, 2))
+      .join("\n\n");
+
+    // Structured output requested from the model.
+    const issueSchema = z.object({
+      issue: z.string().describe("Description of the issue"),
+      severity: z
+        .enum(["minor", "significant", "critical"])
+        .describe("Severity of the issue"),
+    });
+    const resultSchema = z.object({
+      pass: z
+        .boolean()
+        .describe("Whether the generated UI meets the requirements"),
+      reason: z.string().describe("Summary of the reason for a failure."),
+      issues: z.array(issueSchema).describe("List of specific issues found."),
+    });
+
+    const promptText = `You are an expert QA evaluator for a UI generation system.
+Your task is to evaluate whether the generated UI JSON matches the user's request and conforms to the expected behavior.
+
+User Request:
+${originalPrompt}
+
+Expected Schemas:
+${schemaText}
+
+Generated Output (JSONL in Markdown):
+${generatedOutput}
+
+Instructions:
+1. Analyze the Generated Output against the User Request.
+2. Check if all requested components are present and match the user's intent.
+3. Check if the hierarchy and properties match the description.
+4. Verify that the content (text, labels, etc.) is correct and makes sense.
+5. Ignore minor formatting differences.
+6. If the output is correct and satisfies the request, return "pass": true.
+7. If there are missing components, incorrect values, or structural issues that affect the user experience, return "pass": false and provide a detailed "reason".
+8. In the "reason", explicitly quote the part of the JSON that is incorrect if possible.
+
+- You can be lenient in your evaluation for URLs, as the generated output may use a placeholder URL for images and icons.
+- If label text is similar but not exact, you can still pass the test as long as the meaning is the same. (e.g. "Cancel" vs "Cancel Order")
+- If the generated output is missing a component that is specified in the user request, it is required to exist in the output in order to pass the test. If it is not specified, it is not required.
+- If the request is vague about the contents of a label or other property, you can still pass the test as long as it can be construed as matching the intent.
+- Unless explicitly required to be absent by the user request, extra components or attributes are allowed.
+
+Severity Definitions:
+- Minor: Merely cosmetic or a slight deviation from the request.
+- Significant: The UI isn't very ergonomic or would be hard to understand.
+- Critical: That part of the UI is left off, or the structure isn't valid and can't be rendered.
+
+Return a JSON object with the following schema:
+
+\`\`\`json
+{
+  "type": "object",
+  "properties": {
+    "pass": {
+      "type": "boolean",
+      "description": "Whether the generated UI meets the requirements"
+    },
+    "reason": {
+      "type": "string",
+      "description": "Summary of the reason for a failure."
+    },
+    "issues": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "issue": {
+            "type": "string",
+            "description": "Description of the issue"
+          },
+          "severity": {
+            "type": "string",
+            "enum": ["minor", "significant", "critical"],
+            "description": "Severity of the issue"
+          }
+        },
+        "required": ["issue", "severity"]
+      },
+      "description": "List of specific issues found."
+    }
+  },
+  "required": ["pass", "reason", "issues"]
+}
+\`\`\`
+`;
+
+    // Rough token estimate for rate limiting: one token per 2.5 characters.
+    const tokenEstimate = Math.ceil(promptText.length / 2.5);
+
+    // Use the configured entry for the eval model when one exists; otherwise
+    // build a temporary entry with default limits.
+    const { modelsToTest } = await import("./models");
+    let evalModelConfig = modelsToTest.find(
+      (candidate) => candidate.name === evalModel
+    );
+    if (!evalModelConfig) {
+      evalModelConfig = {
+        name: evalModel,
+        model: null, // generate() falls back to the model name below
+        requestsPerMinute: 60,
+        tokensPerMinute: 100000,
+      };
+    }
+
+    await rateLimiter.acquirePermit(evalModelConfig, tokenEstimate);
+
+    try {
+      const { output: verdict } = await ai.generate({
+        prompt: promptText,
+        // Model object when configured, otherwise the plain model name.
+        model: evalModelConfig.model || evalModel,
+        config: evalModelConfig.config,
+        output: {
+          schema: resultSchema,
+        },
+      });
+
+      if (!verdict) {
+        throw new Error("No output from evaluation model");
+      }
+
+      return {
+        pass: verdict.pass,
+        reason: verdict.reason || "No reason provided",
+        issues: verdict.issues || [],
+        evalPrompt: promptText,
+      };
+    } catch (e: any) {
+      logger.error(`Error during evaluation: ${e}`);
+      if (evalModelConfig) {
+        rateLimiter.reportError(evalModelConfig, e);
+      }
+      // Propagate so the caller's retry logic can handle the failure.
+      throw e;
+    }
+  }
+);
--- a/vendor/a2ui/specification/0.9/eval/src/evaluator.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/evaluator.ts
@@ -0,0 +1,205 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { evaluationFlow } from "./evaluation_flow";
+import { ValidatedResult, EvaluatedResult } from "./types";
+import { logger } from "./logger";
+import { rateLimiter } from "./rateLimiter";
+import * as fs from "fs";
+import * as path from "path";
+import * as yaml from "js-yaml";
+import { IssueSeverity } from "./types";
+
+/**
+ * Phase 3 of the eval run: sends every schema-valid result to
+ * `evaluationFlow` and attaches the verdict to it.
+ */
+export class Evaluator {
+  /**
+   * @param schemas Schemas forwarded unchanged to the evaluation flow.
+   * @param evalModel Name of the model used to judge results.
+   * @param outputDir When set, details of failed evaluations are written
+   *   under this directory.
+   */
+  constructor(
+    private schemas: any,
+    private evalModel: string,
+    private outputDir?: string
+  ) {}
+
+  /**
+   * Evaluates all results that passed validation and have components.
+   *
+   * Results with validation errors are returned with a synthetic failing
+   * verdict of severity "criticalSchema"; results without components are
+   * returned unchanged. The order of the returned array follows completion
+   * order, not input order.
+   *
+   * @param results Validated results from the previous phase.
+   * @returns One entry per input result.
+   */
+  async run(results: ValidatedResult[]): Promise<EvaluatedResult[]> {
+    const passedResults = results.filter(
+      (r) => r.validationErrors.length === 0 && r.components
+    );
+    const skippedCount = results.length - passedResults.length;
+
+    logger.info(
+      `Starting Phase 3: LLM Evaluation (${passedResults.length} items to evaluate, ${skippedCount} skipped due to validation failure)`
+    );
+
+    const totalJobs = passedResults.length;
+    let completedCount = 0;
+    let failedCount = 0;
+    const evaluatedResults: EvaluatedResult[] = [];
+
+    // Seed the output with the items that will not be sent to the model.
+    for (const result of results) {
+      if (result.validationErrors.length > 0) {
+        evaluatedResults.push({
+          ...result,
+          evaluationResult: {
+            pass: false,
+            reason: "Schema validation failure",
+            issues: [
+              {
+                issue: result.validationErrors.join("\n"),
+                severity: "criticalSchema",
+              },
+            ],
+            overallSeverity: "criticalSchema",
+          },
+        });
+      } else if (!result.components) {
+        evaluatedResults.push({ ...result });
+      }
+    }
+
+    if (totalJobs === 0) {
+      logger.info("Phase 3: Evaluation Complete (No items to evaluate)");
+      return evaluatedResults;
+    }
+
+    const progressInterval = setInterval(() => {
+      const queuedCount = rateLimiter.waitingCount;
+      const inProgressCount =
+        totalJobs - completedCount - failedCount - queuedCount;
+      const pct = Math.round(
+        ((completedCount + failedCount) / totalJobs) * 100
+      );
+      process.stderr.write(
+        `\r[Phase 3] Progress: ${pct}% | Completed: ${completedCount} | In Progress: ${inProgressCount} | Queued: ${queuedCount} | Failed: ${failedCount}          `
+      );
+    }, 1000);
+
+    try {
+      const promises = passedResults.map((result) =>
+        this.runJob(result).then((evalResult) => {
+          if (evalResult.evaluationResult) {
+            completedCount++;
+          } else {
+            failedCount++; // No verdict was attached to the result
+          }
+          evaluatedResults.push(evalResult);
+          return evalResult;
+        })
+      );
+
+      await Promise.all(promises);
+    } finally {
+      // Always stop the ticker: a job can still reject (e.g. a file write
+      // fails) and a live interval would keep printing and hold the process
+      // open.
+      clearInterval(progressInterval);
+      process.stderr.write("\n");
+    }
+    logger.info("Phase 3: Evaluation Complete");
+
+    return evaluatedResults;
+  }
+
+  /**
+   * Evaluates one result, retrying the flow up to three times with
+   * exponential backoff (1s, then 2s). If the last attempt also throws, a
+   * failing verdict describing the error is used instead.
+   *
+   * @param result The validated result to judge.
+   * @returns The result with its verdict and overall severity attached.
+   */
+  private async runJob(result: ValidatedResult): Promise<EvaluatedResult> {
+    const maxEvalRetries = 3;
+    let evaluationResult:
+      | {
+          pass: boolean;
+          reason: string;
+          issues?: { issue: string; severity: IssueSeverity }[];
+        }
+      | undefined;
+
+    for (let evalRetry = 0; evalRetry < maxEvalRetries; evalRetry++) {
+      try {
+        evaluationResult = await evaluationFlow({
+          originalPrompt: result.prompt.promptText,
+          generatedOutput: result.rawText || "",
+          evalModel: this.evalModel,
+          schemas: this.schemas,
+        });
+        break;
+      } catch (e: any) {
+        if (evalRetry === maxEvalRetries - 1) {
+          logger.warn(
+            `Evaluation failed for ${result.prompt.name} run ${result.runNumber}: ${e.message}`
+          );
+          evaluationResult = {
+            pass: false,
+            reason: `Evaluation flow failed: ${e.message}`,
+          };
+        } else {
+          await new Promise((resolve) =>
+            setTimeout(resolve, 1000 * Math.pow(2, evalRetry))
+          );
+        }
+      }
+    }
+
+    // Overall severity is only derived for failing verdicts that list issues.
+    let overallSeverity: IssueSeverity | undefined;
+    if (evaluationResult && !evaluationResult.pass && evaluationResult.issues) {
+      overallSeverity = this.highestSeverity(evaluationResult.issues);
+    }
+
+    if (this.outputDir && evaluationResult) {
+      this.saveEvaluation(result, evaluationResult, overallSeverity);
+    }
+
+    return {
+      ...result,
+      evaluationResult: evaluationResult
+        ? { ...evaluationResult, overallSeverity }
+        : undefined,
+    };
+  }
+
+  /**
+   * Returns the most severe of "critical", "significant" and "minor" found
+   * among the issues, or undefined when none of them carries one of those.
+   */
+  private highestSeverity(
+    issues: { issue: string; severity: IssueSeverity }[]
+  ): IssueSeverity | undefined {
+    const severities = issues.map((i) => i.severity);
+    if (severities.includes("critical")) {
+      return "critical";
+    }
+    if (severities.includes("significant")) {
+      return "significant";
+    }
+    if (severities.includes("minor")) {
+      return "minor";
+    }
+    return undefined;
+  }
+
+  /**
+   * Writes a failing verdict as YAML, plus the evaluation prompt when the
+   * verdict carries one, to `<outputDir>/output-<model>/details`. Passing
+   * verdicts are not saved.
+   */
+  private saveEvaluation(
+    result: ValidatedResult,
+    evaluationResult: {
+      pass: boolean;
+      reason: string;
+      issues?: { issue: string; severity: IssueSeverity }[];
+      evalPrompt?: string;
+    },
+    overallSeverity?: IssueSeverity
+  ) {
+    if (!this.outputDir) return;
+
+    // Only save if the evaluation failed
+    if (evaluationResult.pass) return;
+
+    const modelDir = path.join(
+      this.outputDir,
+      `output-${result.modelName.replace(/[\/:]/g, "_")}`
+    );
+    const detailsDir = path.join(modelDir, "details");
+    // Do not rely on an earlier phase having created the directory.
+    fs.mkdirSync(detailsDir, { recursive: true });
+
+    fs.writeFileSync(
+      path.join(
+        detailsDir,
+        `${result.prompt.name}.${result.runNumber}.failed.yaml`
+      ),
+      yaml.dump({ ...evaluationResult, overallSeverity })
+    );
+
+    if (evaluationResult.evalPrompt) {
+      fs.writeFileSync(
+        path.join(
+          detailsDir,
+          `${result.prompt.name}.${result.runNumber}.eval_prompt.txt`
+        ),
+        evaluationResult.evalPrompt
+      );
+    }
+  }
+}
--- a/vendor/a2ui/specification/0.9/eval/src/generation_flow.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/generation_flow.ts
@@ -0,0 +1,148 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { z } from "genkit";
+import { ai } from "./ai";
+import { ModelConfiguration } from "./models";
+import { rateLimiter } from "./rateLimiter";
+import { logger } from "./logger";
+
+// Flow that prompts the configured model to emit A2UI JSON messages for a
+// request and returns the raw response text together with the call latency.
+export const componentGeneratorFlow = ai.defineFlow(
+  {
+    name: "componentGeneratorFlow",
+    inputSchema: z.object({
+      prompt: z.string(),
+      modelConfig: z.any(), // No Zod schema exists for ModelConfiguration
+      schemas: z.any(),
+      catalogRules: z.string().optional(),
+    }),
+    outputSchema: z.any(),
+  },
+  async ({ prompt, modelConfig, schemas, catalogRules }) => {
+    const limiterConfig = modelConfig as ModelConfiguration;
+
+    // Every schema is pretty-printed and embedded in the prompt.
+    const schemaText = Object.values(schemas)
+      .map((schema: any) => JSON.stringify(schema, null, 2))
+      .join("\n\n");
+
+    // Optional catalog-specific section; empty when no rules were supplied.
+    const catalogSection = catalogRules
+      ? `\nInstructions specific to this catalog:\n${catalogRules}`
+      : "";
+
+    const fullPrompt = `You are an AI assistant. Based on the following request, generate a stream of JSON messages that conform to the provided JSON Schemas.
+The output MUST be a series of JSON objects, each enclosed in a markdown code block (or a single block with multiple objects).
+
+Standard Instructions:
+1. Generate a 'createSurface' message with surfaceId 'main' and catalogId 'https://a2ui.dev/specification/0.9/standard_catalog_definition.json'.
+2. Generate a 'updateComponents' message with surfaceId 'main' containing the requested UI.
+3. Ensure all component children are referenced by ID (using the 'children' or 'child' property with IDs), NOT nested inline as objects.
+4. If the request involves data binding, you may also generate 'updateDataModel' messages.
+5. Among the 'updateComponents' messages in the output, there MUST be one root component with id: 'root'.
+6. Components need to be nested within a root layout container (Column, Row). No need to add an extra container if the root is already a layout container.
+7. There shouldn't be any orphaned components: no components should be generated which don't have a parent, except for the root component.
+8. Do NOT output a list of lists (e.g. [[...]]). Output individual JSON objects separated by newlines.
+9. STRICTLY follow the JSON Schemas. Do NOT add any properties that are not defined in the schema. Ensure ALL required properties are present.
+10. Do NOT invent data bindings or action contexts. Only use them if the prompt explicitly asks for them.
+11. Read the 'description' field of each component in the schema carefully. It contains critical usage instructions (e.g. regarding labels, single child limits, and layout behavior) that you MUST follow.
+12. Do NOT define components inline inside 'child' or 'children'. Always use a string ID referencing a separate component definition.
+13. Do NOT use a 'style' property. Use standard properties like 'alignment', 'distribution', 'usageHint', etc.
+14. Do NOT invent properties that are not in the schema. Check the 'properties' list for each component type.
+${catalogSection}
+
+Schemas:
+${schemaText}
+
+Request:
+${prompt}
+`;
+
+    // Rough token estimate for rate limiting: one token per 2.5 characters.
+    const estimatedInputTokens = Math.ceil(fullPrompt.length / 2.5);
+    await rateLimiter.acquirePermit(limiterConfig, estimatedInputTokens);
+
+    // Call the model, timing the request.
+    let response;
+    const startedAt = Date.now();
+    try {
+      response = await ai.generate({
+        prompt: fullPrompt,
+        model: modelConfig.model,
+        config: modelConfig.config,
+      });
+    } catch (e) {
+      logger.error(`Error during ai.generate: ${e}`);
+      rateLimiter.reportError(limiterConfig, e);
+      throw e;
+    }
+    const latency = Date.now() - startedAt;
+
+    if (!response) throw new Error("Failed to generate component");
+
+    const rawResponse = response as any;
+    let candidate = rawResponse.candidates?.[0];
+
+    // Responses that expose `message` instead of `candidates` are wrapped
+    // into a candidate-shaped object.
+    if (!candidate && rawResponse.message) {
+      const message = rawResponse.message;
+      candidate = {
+        index: 0,
+        content: message.content,
+        finishReason: "STOP", // Assumed, since this shape carries none
+        message: message,
+      };
+    }
+
+    if (!candidate) {
+      logger.error(
+        `No candidates returned in response. Full response: ${JSON.stringify(response, null, 2)}`
+      );
+      throw new Error("No candidates returned");
+    }
+
+    const finishReason = candidate.finishReason;
+    if (finishReason !== undefined && finishReason !== "STOP") {
+      logger.warn(
+        `Model finished with reason: ${finishReason}. Content: ${JSON.stringify(
+          candidate.content
+        )}`
+      );
+    }
+
+    // The estimate was already charged when the permit was acquired. Charge
+    // only the input tokens used beyond that estimate (over-estimates are
+    // not refunded) plus all output tokens.
+    const inputTokens = response.usage?.inputTokens || 0;
+    const outputTokens = response.usage?.outputTokens || 0;
+    const underEstimatedInput = Math.max(0, inputTokens - estimatedInputTokens);
+    const tokensToAdd = underEstimatedInput + outputTokens;
+
+    if (tokensToAdd > 0) {
+      rateLimiter.recordUsage(limiterConfig, tokensToAdd, false);
+    }
+
+    return { text: response.text, latency };
+  }
+);
--- a/vendor/a2ui/specification/0.9/eval/src/generator.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/generator.ts
@@ -0,0 +1,211 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { componentGeneratorFlow } from "./generation_flow";
+import { ModelConfiguration } from "./models";
+import { TestPrompt } from "./prompts";
+import { GeneratedResult } from "./types";
+import { extractJsonFromMarkdown } from "./utils";
+import { rateLimiter } from "./rateLimiter";
+import { logger } from "./logger";
+import * as fs from "fs";
+import * as path from "path";
+
+/**
+ * Phase 1 of the eval run: invokes `componentGeneratorFlow` for every
+ * model / prompt / run combination and collects the parsed output.
+ */
+export class Generator {
+  /**
+   * @param schemas Schemas forwarded unchanged to the generation flow.
+   * @param outputDir When set, artifacts and parse errors are written under
+   *   this directory.
+   * @param catalogRules Optional catalog-specific instructions forwarded to
+   *   the generation flow.
+   */
+  constructor(
+    private schemas: any,
+    private outputDir?: string,
+    private catalogRules?: string
+  ) {}
+
+  /**
+   * Starts one job per (model, prompt, run number) and waits for all of
+   * them. Results are returned in completion order.
+   */
+  async run(
+    prompts: TestPrompt[],
+    models: ModelConfiguration[],
+    runsPerPrompt: number
+  ): Promise<GeneratedResult[]> {
+    const totalJobs = prompts.length * models.length * runsPerPrompt;
+    const results: GeneratedResult[] = [];
+    const pending: Promise<GeneratedResult>[] = [];
+    let completedCount = 0;
+    let failedCount = 0;
+
+    logger.info(`Starting Phase 1: Generation (${totalJobs} jobs)`);
+
+    // Once a second, redraw a single-line progress report on stderr.
+    const reportProgress = () => {
+      const queuedCount = rateLimiter.waitingCount;
+      const finishedCount = completedCount + failedCount;
+      const inProgressCount =
+        totalJobs - completedCount - failedCount - queuedCount;
+      let pct = 0;
+      if (totalJobs > 0) {
+        pct = Math.round((finishedCount / totalJobs) * 100);
+      }
+      process.stderr.write(
+        `\r[Phase 1] Progress: ${pct}% | Completed: ${completedCount} | In Progress: ${inProgressCount} | Queued: ${queuedCount} | Failed: ${failedCount}          `
+      );
+    };
+    const progressInterval = setInterval(reportProgress, 1000);
+
+    // Updates the counters and the result list when a job settles.
+    const track = (job: Promise<GeneratedResult>) =>
+      job.then((result) => {
+        if (result.error) {
+          failedCount++;
+        } else {
+          completedCount++;
+        }
+        results.push(result);
+        return result;
+      });
+
+    for (const model of models) {
+      for (const prompt of prompts) {
+        for (let runNumber = 1; runNumber <= runsPerPrompt; runNumber++) {
+          pending.push(track(this.runJob(model, prompt, runNumber)));
+        }
+      }
+    }
+
+    await Promise.all(pending);
+    clearInterval(progressInterval);
+    process.stderr.write("\n");
+    logger.info("Phase 1: Generation Complete");
+
+    return results;
+  }
+
+  /**
+   * Runs one generation and parses the JSON out of the returned Markdown.
+   * When the flow throws, the job is retried once; a second failure is
+   * returned as a result carrying the error. Parse failures are recorded on
+   * the result without retrying.
+   */
+  private async runJob(
+    model: ModelConfiguration,
+    prompt: TestPrompt,
+    runIndex: number,
+    retryCount: number = 0
+  ): Promise<GeneratedResult> {
+    const startTime = Date.now();
+    try {
+      const output: any = await componentGeneratorFlow({
+        prompt: prompt.promptText,
+        modelConfig: model,
+        schemas: this.schemas,
+        catalogRules: this.catalogRules,
+      });
+
+      const text = output?.text;
+      const latency = output?.latency || 0;
+      let components: any[] = [];
+      let error: any = null;
+
+      if (!text) {
+        error = new Error("No output text returned from model");
+      } else {
+        try {
+          components = extractJsonFromMarkdown(text);
+          // No-op when no output directory is configured.
+          this.saveArtifacts(model, prompt, runIndex, text, components);
+        } catch (e) {
+          error = e;
+          // No-op when no output directory is configured.
+          this.saveError(model, prompt, runIndex, text, e);
+        }
+      }
+
+      return {
+        modelName: model.name,
+        prompt,
+        runNumber: runIndex,
+        rawText: text,
+        components,
+        latency,
+        error,
+      };
+    } catch (error: any) {
+      // Single retry before giving up on this job.
+      if (retryCount < 1) {
+        return this.runJob(model, prompt, runIndex, retryCount + 1);
+      }
+      return {
+        modelName: model.name,
+        prompt,
+        runNumber: runIndex,
+        latency: Date.now() - startTime,
+        error,
+      };
+    }
+  }
+
+  /**
+   * Creates `<outputDir>/output-<model>/details` if needed and returns its
+   * path, or undefined when no output directory is configured.
+   */
+  private ensureDetailsDir(model: ModelConfiguration): string | undefined {
+    if (!this.outputDir) return undefined;
+    const detailsDir = path.join(
+      this.outputDir,
+      `output-${model.name.replace(/[\/:]/g, "_")}`,
+      "details"
+    );
+    fs.mkdirSync(detailsDir, { recursive: true });
+    return detailsDir;
+  }
+
+  /**
+   * Writes the parsed components as pretty-printed JSON and as a `.sample`
+   * file made of a YAML front-matter header followed by one JSON object per
+   * line.
+   */
+  private saveArtifacts(
+    model: ModelConfiguration,
+    prompt: TestPrompt,
+    runIndex: number,
+    text: string,
+    components: any[]
+  ) {
+    const detailsDir = this.ensureDetailsDir(model);
+    if (!detailsDir) return;
+
+    const baseName = `${prompt.name}.${runIndex}`;
+
+    fs.writeFileSync(
+      path.join(detailsDir, `${baseName}.json`),
+      JSON.stringify(components, null, 2)
+    );
+
+    // The prompt goes into a YAML block scalar, so every line is indented by
+    // two spaces.
+    const indentedPrompt = prompt.promptText
+      .split("\n")
+      .map((line) => "  " + line)
+      .join("\n");
+    const yamlHeader = [
+      "---",
+      `description: ${prompt.description}`,
+      `name: ${prompt.name}`,
+      "prompt: |",
+      indentedPrompt,
+      "---",
+      "",
+    ].join("\n");
+    const jsonlBody = components
+      .map((component) => JSON.stringify(component) + "\n")
+      .join("");
+
+    fs.writeFileSync(
+      path.join(detailsDir, `${baseName}.sample`),
+      yamlHeader + jsonlBody
+    );
+  }
+
+  /**
+   * Writes the raw model output and the error's message and stack for a run
+   * whose output could not be processed.
+   */
+  private saveError(
+    model: ModelConfiguration,
+    prompt: TestPrompt,
+    runIndex: number,
+    text: string | undefined,
+    error: any
+  ) {
+    const detailsDir = this.ensureDetailsDir(model);
+    if (!detailsDir) return;
+
+    const baseName = `${prompt.name}.${runIndex}`;
+    const errorReport = { message: error.message, stack: error.stack };
+
+    fs.writeFileSync(
+      path.join(detailsDir, `${baseName}.output.txt`),
+      text || "No output"
+    );
+    fs.writeFileSync(
+      path.join(detailsDir, `${baseName}.error.json`),
+      JSON.stringify(errorReport, null, 2)
+    );
+  }
+}
--- a/vendor/a2ui/specification/0.9/eval/src/index.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/index.ts
@@ -0,0 +1,493 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import * as fs from "fs";
+import * as path from "path";
+import yargs from "yargs";
+import { hideBin } from "yargs/helpers";
+import { logger, setupLogger } from "./logger";
+import { modelsToTest } from "./models";
+import { prompts, TestPrompt } from "./prompts";
+import { Generator } from "./generator";
+import { Validator } from "./validator";
+import { Evaluator } from "./evaluator";
+import { EvaluatedResult } from "./types";
+import { analysisFlow } from "./analysis_flow";
+
+const schemaFiles = [ // schema files read by loadSchemas(); paths are relative to this module's directory (__dirname)
+  "../../json/common_types.json",
+  "../../json/standard_catalog_definition.json",
+  "../../json/server_to_client.json",
+];
+
+// Reads and parses every file listed in `schemaFiles` (resolved against this
+// module's directory) and returns the parsed schemas keyed by file base name.
+function loadSchemas(): Record<string, any> {
+  const entries = schemaFiles.map((relativePath): [string, any] => {
+    const raw = fs.readFileSync(path.join(__dirname, relativePath), "utf-8");
+    return [path.basename(relativePath), JSON.parse(raw)];
+  });
+  return Object.fromEntries(entries);
+}
+
+/**
+ * Renders the evaluation results as a Markdown report.
+ *
+ * The report contains one section per model (a per-prompt table, the model's
+ * success rate and, when `analysisResults` has an entry for it, the failure
+ * analysis text) followed by an overall summary with failure counts, a
+ * severity breakdown and mean/median latency.
+ *
+ * Zero counts in the per-prompt table are rendered as empty cells, and that
+ * table reports the "minor", "significant" and "critical" severities only;
+ * "criticalSchema" is counted in the overall breakdown.
+ *
+ * @param results Evaluated runs across all models and prompts.
+ * @param analysisResults Failure-analysis text keyed by model name.
+ * @returns The complete Markdown document as one string.
+ */
+function generateSummary(
+  results: EvaluatedResult[],
+  analysisResults: Record<string, string>
+): string {
+  // Table layout: column title and padded cell width, in display order.
+  const columns: [string, number][] = [
+    ["Prompt Name", 40],
+    ["Avg Latency (ms)", 20],
+    ["Schema Fail", 15],
+    ["Eval Fail", 15],
+    ["Minor", 15],
+    ["Significant", 15],
+    ["Critical", 15],
+  ];
+  // Formats one table row, padding each cell to its column width.
+  const tableRow = (cells: string[]): string =>
+    `| ${cells.map((cell, i) => cell.padEnd(columns[i][1])).join(" | ")} |`;
+  const headerRow = tableRow(columns.map(([title]) => title));
+  const dividerRow = `|${columns
+    .map(([, width]) => "-".repeat(width + 2))
+    .join("|")}|`;
+
+  // Buckets runs by a string key, creating each bucket on first use.
+  const bucketBy = (
+    items: EvaluatedResult[],
+    keyOf: (item: EvaluatedResult) => string
+  ): Record<string, EvaluatedResult[]> => {
+    const buckets: Record<string, EvaluatedResult[]> = {};
+    for (const item of items) {
+      const key = keyOf(item);
+      if (!buckets[key]) {
+        buckets[key] = [];
+      }
+      buckets[key].push(item);
+    }
+    return buckets;
+  };
+
+  // Failure predicates; a tool error short-circuits the remaining checks.
+  const schemaFailed = (r: EvaluatedResult) =>
+    r.error || r.validationErrors.length > 0;
+  const evalFailed = (r: EvaluatedResult) =>
+    r.evaluationResult && !r.evaluationResult.pass;
+  const anyFailure = (r: EvaluatedResult) => schemaFailed(r) || evalFailed(r);
+
+  // Counts evaluator issues per severity label across the given runs.
+  const tallySeverities = (runs: EvaluatedResult[]) => {
+    const tally = { minor: 0, significant: 0, critical: 0, criticalSchema: 0 };
+    for (const run of runs) {
+      const issues = run.evaluationResult?.issues;
+      if (!issues) {
+        continue;
+      }
+      for (const { severity } of issues) {
+        switch (severity) {
+          case "minor":
+            tally.minor++;
+            break;
+          case "significant":
+            tally.significant++;
+            break;
+          case "critical":
+            tally.critical++;
+            break;
+          case "criticalSchema":
+            tally.criticalSchema++;
+            break;
+        }
+      }
+    }
+    return tally;
+  };
+
+  // Fragments are collected in order and concatenated once at the end.
+  const parts: string[] = ["# Evaluation Summary"];
+
+  // Per-model sections.
+  const byModel = bucketBy(results, (r) => r.modelName);
+  for (const [modelName, modelRuns] of Object.entries(byModel)) {
+    parts.push(`\n\n## Model: ${modelName}\n\n`, headerRow, `\n${dividerRow}`);
+
+    // One table row per prompt, ordered by prompt name.
+    const byPrompt = bucketBy(modelRuns, (r) => r.prompt.name);
+    for (const promptName of Object.keys(byPrompt).sort()) {
+      const runs = byPrompt[promptName];
+      const runCount = runs.length;
+      const schemaFailCount = runs.filter(schemaFailed).length;
+      const evalFailCount = runs.filter(evalFailed).length;
+      const latencySum = runs.reduce((sum, r) => sum + r.latency, 0);
+      const severities = tallySeverities(runs);
+
+      // Zero counts are rendered as empty cells.
+      const asRatio = (n: number) => (n > 0 ? `${n} / ${runCount}` : "");
+      const asCount = (n: number) => (n > 0 ? `${n}` : "");
+      parts.push(
+        "\n" +
+          tableRow([
+            promptName,
+            (latencySum / runCount).toFixed(0),
+            asRatio(schemaFailCount),
+            asRatio(evalFailCount),
+            asCount(severities.minor),
+            asCount(severities.significant),
+            asCount(severities.critical),
+          ])
+      );
+    }
+
+    // A run is successful when it has no tool error, no schema errors and
+    // either no evaluation result or a passing one.
+    const modelRunCount = modelRuns.length;
+    const okCount = modelRuns.filter(
+      (r) =>
+        !r.error &&
+        r.validationErrors.length === 0 &&
+        (!r.evaluationResult || r.evaluationResult.pass)
+    ).length;
+    const okPercent =
+      modelRunCount === 0
+        ? "0.0"
+        : ((okCount / modelRunCount) * 100.0).toFixed(1);
+    parts.push(
+      `\n\n**Total successful runs:** ${okCount} / ${modelRunCount} (${okPercent}% success)`
+    );
+
+    // Optional failure analysis text for this model.
+    const analysis = analysisResults[modelName];
+    if (analysis) {
+      parts.push(`\n\n### Failure Analysis\n\n${analysis}`);
+    }
+  }
+
+  // Overall figures across every model.
+  const totalRuns = results.length;
+  const toolErrorCount = results.filter((r) => r.error).length;
+  const failedRuns = results.filter(anyFailure);
+  // Distinct names of models with failures, in first-failure order.
+  const failingModels = Array.from(
+    new Set(failedRuns.map((r) => r.modelName))
+  ).join(", ");
+  const totals = tallySeverities(results);
+  const overallPercent =
+    totalRuns === 0
+      ? "0.0"
+      : (((totalRuns - failedRuns.length) / totalRuns) * 100.0).toFixed(1);
+
+  // Latency statistics; the median of an even count averages the middle pair.
+  const sorted = results.map((r) => r.latency).sort((a, b) => a - b);
+  const latencyTotal = sorted.reduce((sum, value) => sum + value, 0);
+  const meanLatency =
+    totalRuns > 0 ? (latencyTotal / totalRuns).toFixed(0) : "0";
+  const middle = Math.floor(sorted.length / 2);
+  let medianLatency = 0;
+  if (sorted.length > 0) {
+    medianLatency =
+      sorted.length % 2 === 0
+        ? (sorted[middle - 1] + sorted[middle]) / 2
+        : sorted[middle];
+  }
+
+  // Overall section; entries are appended in the order they are rendered.
+  parts.push(
+    "\n\n---\n\n## Overall Summary\n",
+    `\n- **Total tool failures:** ${toolErrorCount} / ${totalRuns}`,
+    `\n- **Number of runs with any failure (tool error, validation, or eval):** ${failedRuns.length} / ${totalRuns} (${overallPercent}% success)`,
+    `\n- **Severity Breakdown:**`,
+    `\n  - **Minor:** ${totals.minor}`,
+    `\n  - **Significant:** ${totals.significant}`,
+    `\n  - **Critical (Eval):** ${totals.critical}`,
+    `\n  - **Critical (Schema):** ${totals.criticalSchema}`,
+    `\n- **Mean Latency:** ${meanLatency} ms`,
+    `\n- **Median Latency:** ${medianLatency} ms`
+  );
+  // The list of failing models is omitted when nothing failed.
+  if (failingModels) {
+    parts.push(`\n- **Models with at least one failure:** ${failingModels}`);
+  }
+  return parts.join("");
+}
+
+async function main() { // CLI entry point: parse flags, run the four phases, then log and save the summary
+  const argv = await yargs(hideBin(process.argv))
+    .option("log-level", {
+      type: "string",
+      description: "Set the logging level",
+      default: "info",
+      choices: ["debug", "info", "warn", "error"],
+    })
+    .option("results", {
+      type: "string",
+      description:
+        "Directory to keep output files. If not specified, uses results/output-<model>. If specified, uses the provided directory (appending output-<model>).",
+      coerce: (arg) => (arg === undefined ? true : arg),
+      default: true,
+    })
+    .option("runs-per-prompt", {
+      type: "number",
+      description: "Number of times to run each prompt",
+      default: 1,
+    })
+    .option("model", {
+      type: "string",
+      array: true,
+      description: "Filter models by exact name",
+      default: [],
+      choices: modelsToTest.map((m) => m.name),
+    })
+    .option("prompt", {
+      type: "string",
+      array: true,
+      description: "Filter prompts by name prefix",
+    })
+    .option("eval-model", {
+      type: "string",
+      description: "Model to use for evaluation",
+      default: "gemini-2.5-flash",
+      choices: modelsToTest.map((m) => m.name),
+    })
+    .option("clean-results", {
+      type: "boolean",
+      description: "Clear the output directory before starting",
+      default: false,
+    })
+
+    .help()
+    .alias("h", "help")
+    .strict().argv;
+
+  // Filter models: --model values must match a configured model name exactly.
+  let filteredModels = modelsToTest;
+  if (argv.model && argv.model.length > 0) {
+    const modelNames = argv.model as string[];
+    filteredModels = modelsToTest.filter((m) => modelNames.includes(m.name));
+    if (filteredModels.length === 0) {
+      logger.error(`No models found matching: ${modelNames.join(", ")}.`);
+      process.exit(1);
+    }
+  }
+
+  // Filter prompts: keep those whose name starts with any --prompt value.
+  let filteredPrompts = prompts;
+  if (argv.prompt && argv.prompt.length > 0) {
+    const promptPrefixes = argv.prompt as string[];
+    filteredPrompts = prompts.filter((p) =>
+      promptPrefixes.some((prefix) => p.name.startsWith(prefix))
+    );
+    if (filteredPrompts.length === 0) {
+      logger.error(
+        `No prompt found with prefix "${promptPrefixes.join(", ")}".`
+      );
+      process.exit(1);
+    }
+  }
+
+  // Base output directory handed to every phase below: the --results string
+  // when one was given, otherwise "results" (the coerced default `true`).
+  // Per-model subdirectories are named `output-<model>` beneath it.
+  let resultsBaseDir: string | undefined;
+  const resultsArg = argv.results;
+  if (typeof resultsArg === "string") {
+    resultsBaseDir = resultsArg;
+  } else if (resultsArg === true) {
+    resultsBaseDir = "results";
+  }
+
+  // --clean-results: delete the base results directory before the run.
+  if (
+    argv["clean-results"] &&
+    resultsBaseDir &&
+    fs.existsSync(resultsBaseDir)
+  ) {
+    // NOTE(review): both branches below perform the same recursive delete, so
+    // the default "results" directory and a user-supplied path are treated
+    // identically and the branches could be merged.
+    // NOTE(review): there is no safeguard against a dangerous path such as
+    // "/"; consider validating `resultsBaseDir` before removing it.
+    // The whole base directory is removed, not only the selected models' dirs.
+    if (resultsBaseDir === "results") {
+      fs.rmSync(resultsBaseDir, { recursive: true, force: true });
+    } else {
+      // Custom directory: removed in full as well, because the user
+      // explicitly asked for a clean run.
+      fs.rmSync(resultsBaseDir, { recursive: true, force: true });
+    }
+  }
+
+  // Logger setup.
+  // `setupLogger(dir, level)` sets the console level and, when `dir` is given,
+  // additionally logs to a file inside that directory.
+  // - Exactly one model selected: the log file goes into that model's
+  //   `output-<model>` directory under the base results directory.
+  // - Several models selected, or no results directory: console logging only,
+  //   because there is no single per-model directory to own the log file.
+  // The directory name is built the same way as in the summary loop at the
+  // end of this function (slashes and colons replaced by underscores).
+  // NOTE(review): a shared log file for multi-model runs is not implemented;
+  // add one here if file logs are needed for such runs.
+
+  if (resultsBaseDir) {
+    if (filteredModels.length === 1) {
+      const modelDirName = `output-${filteredModels[0].name.replace(/[\/:]/g, "_")}`;
+      setupLogger(path.join(resultsBaseDir, modelDirName), argv["log-level"]);
+    } else {
+      // Multiple models: skip file logging and use the console only.
+      // setupLogger creates its file transport ("output.log") only when it
+      // receives a directory, so passing `undefined` disables it.
+      // NOTE(review): this call is identical to the one in the outer `else`
+      // below; the two branches could be merged.
+      setupLogger(undefined, argv["log-level"]);
+    }
+  } else {
+    setupLogger(undefined, argv["log-level"]);
+  }
+
+  const schemas = loadSchemas();
+  const catalogRulesPath = path.join(
+    __dirname,
+    "../../json/standard_catalog_rules.txt"
+  );
+  let catalogRules: string | undefined; // stays undefined when the rules file is missing
+  if (fs.existsSync(catalogRulesPath)) {
+    catalogRules = fs.readFileSync(catalogRulesPath, "utf-8");
+  } else {
+    logger.warn(
+      `Catalog rules file not found at ${catalogRulesPath}. Proceeding without specific catalog rules.`
+    );
+  }
+
+  // Phase 1: Generation
+  const generator = new Generator(schemas, resultsBaseDir, catalogRules);
+  const generatedResults = await generator.run(
+    filteredPrompts,
+    filteredModels,
+    argv["runs-per-prompt"]
+  );
+
+  // Phase 2: Validation
+  const validator = new Validator(schemas, resultsBaseDir);
+  const validatedResults = await validator.run(generatedResults);
+
+  // Phase 3: Evaluation
+  const evaluator = new Evaluator(schemas, argv["eval-model"], resultsBaseDir);
+  const evaluatedResults = await evaluator.run(validatedResults);
+
+  // Phase 4: Failure Analysis (per model: collect failed runs, pass them to analysisFlow)
+  const analysisResults: Record<string, string> = {};
+  const resultsByModel: Record<string, EvaluatedResult[]> = {};
+  for (const result of evaluatedResults) {
+    if (!resultsByModel[result.modelName]) {
+      resultsByModel[result.modelName] = [];
+    }
+    resultsByModel[result.modelName].push(result);
+  }
+
+  for (const modelName in resultsByModel) {
+    const modelResults = resultsByModel[modelName];
+    const failures = modelResults
+      .filter(
+        (r) =>
+          r.error ||
+          r.validationErrors.length > 0 ||
+          (r.evaluationResult && !r.evaluationResult.pass)
+      )
+      .map((r) => {
+        let failureType = "Unknown"; // first matching category wins: tool error, then schema, then evaluation
+        let reason = "Unknown";
+        let issues: string[] = [];
+
+        if (r.error) {
+          failureType = "Tool Error";
+          reason = r.error.message || String(r.error);
+        } else if (r.validationErrors.length > 0) {
+          failureType = "Schema Validation";
+          reason = "Schema validation failed";
+          issues = r.validationErrors;
+        } else if (r.evaluationResult && !r.evaluationResult.pass) {
+          failureType = "Evaluation Failure";
+          reason = r.evaluationResult.reason;
+          if (r.evaluationResult.issues) {
+            issues = r.evaluationResult.issues.map(
+              (i) => `${i.severity}: ${i.issue}`
+            );
+          }
+        }
+
+        return {
+          promptName: r.prompt.name,
+          runNumber: r.runNumber,
+          failureType,
+          reason,
+          issues,
+        };
+      });
+
+    if (failures.length > 0) {
+      logger.info(`Running failure analysis for model: ${modelName}...`);
+      try {
+        const analysis = await analysisFlow({
+          modelName,
+          failures,
+          numRuns: modelResults.length,
+          evalModel: argv["eval-model"],
+        });
+        analysisResults[modelName] = analysis;
+      } catch (e) { // a failed analysis is not fatal: log it and store a placeholder
+        logger.error(`Failed to run failure analysis for ${modelName}: ${e}`);
+        analysisResults[modelName] = "Failed to run analysis.";
+      }
+    }
+  }
+
+  // Summary: one Markdown report covering all models, logged and then saved.
+  const summary = generateSummary(evaluatedResults, analysisResults);
+  logger.info(summary);
+
+  if (resultsBaseDir) {
+    // Write the same combined summary into every selected model's output
+    // directory. Model directories that do not exist at this point are
+    // skipped rather than created here.
+    for (const model of filteredModels) {
+      const modelDirName = `output-${model.name.replace(/[\/:]/g, "_")}`;
+      const modelDir = path.join(resultsBaseDir, modelDirName);
+      if (fs.existsSync(modelDir)) {
+        fs.writeFileSync(path.join(modelDir, "summary.md"), summary);
+      }
+    }
+  }
+}
+
+if (require.main === module) { // run only when executed directly, not when imported
+  main().catch((e) => { console.error(e); process.exitCode = 1; }); // log the failure and exit non-zero
+}
--- a/vendor/a2ui/specification/0.9/eval/src/logger.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/logger.ts
@@ -0,0 +1,70 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import * as winston from "winston";
+import * as path from "path";
+
+let fileTransport: winston.transport | null = null; // file transport currently attached by setupLogger(), if any
+
+const consoleTransport = new winston.transports.Console({
+  level: "info", // Default to info; setupLogger() overwrites this with the requested level
+  format: winston.format.combine(
+    winston.format.colorize(),
+    winston.format.printf(({ timestamp, level, message }) => {
+      // Reset the current line (where a progress bar might be) before logging:
+      // \r returns the cursor to column 0, \x1b[K erases from the cursor to end of line
+      return `\r\x1b[K${timestamp} [${level}]: ${message}`;
+    })
+  ),
+});
+
+// Shared logger instance; it logs to the console only until setupLogger() adds a file transport
+export const logger = winston.createLogger({
+  level: "debug", // Allow all logs to flow through (transports can filter)
+  format: winston.format.combine(
+    winston.format.timestamp(),
+    winston.format.printf(({ timestamp, level, message }) => {
+      return `${timestamp} [${level}]: ${message}`;
+    })
+  ),
+  transports: [consoleTransport],
+});
+
+// Reconfigures the shared logger. The console transport is set to `logLevel`,
+// a file transport from an earlier call is detached, and when `outputDir` is
+// given a new file transport logs every level to <outputDir>/output.log.
+export function setupLogger(outputDir: string | undefined, logLevel: string) {
+  // The logger stays at "debug"; each transport filters by its own level.
+  logger.level = "debug";
+  consoleTransport.level = logLevel;
+
+  if (fileTransport !== null) {
+    logger.remove(fileTransport);
+    fileTransport = null;
+  }
+  if (!outputDir) {
+    return;
+  }
+
+  const timestampedJson = winston.format.combine(
+    winston.format.timestamp(),
+    winston.format.json()
+  );
+  const logFile = path.join(outputDir, "output.log");
+  const options = { filename: logFile, level: "debug", format: timestampedJson };
+  fileTransport = new winston.transports.File(options);
+  logger.add(fileTransport);
+}
--- a/vendor/a2ui/specification/0.9/eval/src/models.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/models.ts
@@ -0,0 +1,93 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { googleAI } from "@genkit-ai/google-genai";
+import { openAI } from "@genkit-ai/compat-oai/openai";
+import { claude35Haiku, claude4Sonnet } from "genkitx-anthropic";
+
+export interface ModelConfiguration { // shape of one entry in `modelsToTest`
+  model: any; // provider model reference, e.g. the value returned by `openAI.model(...)` or `googleAI.model(...)`
+  name: string; // identifier offered as a --model / --eval-model choice and used in `output-<name>` directory names
+  config?: any; // provider-specific options; presumably forwarded with each request — TODO confirm against the caller
+  requestsPerMinute?: number; // presumably a request rate limit for this model — verify against the caller
+  tokensPerMinute?: number; // presumably a token rate limit for this model — verify against the caller
+}
+
+export const modelsToTest: ModelConfiguration[] = [ // every `name` below is a valid --model / --eval-model choice
+  {
+    model: openAI.model("gpt-5.1"),
+    name: "gpt-5.1",
+    config: { reasoning_effort: "minimal" },
+    requestsPerMinute: 500,
+    tokensPerMinute: 30000,
+  },
+  {
+    model: openAI.model("gpt-5-mini"),
+    name: "gpt-5-mini",
+    config: { reasoning_effort: "minimal" },
+    requestsPerMinute: 500,
+    tokensPerMinute: 500000,
+  },
+  {
+    model: openAI.model("gpt-5-nano"),
+    name: "gpt-5-nano",
+    config: {},
+    requestsPerMinute: 500,
+    tokensPerMinute: 200000,
+  },
+  {
+    model: googleAI.model("gemini-2.5-pro"),
+    name: "gemini-2.5-pro",
+    config: { thinkingConfig: { thinkingBudget: 1000 } },
+    requestsPerMinute: 150,
+    tokensPerMinute: 2000000,
+  },
+  {
+    model: googleAI.model("gemini-3-pro-preview"),
+    name: "gemini-3-pro", // NOTE(review): drops the "-preview" suffix of the provider id above — confirm this is intended
+    config: { thinkingConfig: { thinkingBudget: 1000 } },
+    requestsPerMinute: 50,
+    tokensPerMinute: 1000000,
+  },
+  {
+    model: googleAI.model("gemini-2.5-flash"),
+    name: "gemini-2.5-flash", // also the default value of the --eval-model flag
+    config: { thinkingConfig: { thinkingBudget: 0 } },
+    requestsPerMinute: 1000,
+    tokensPerMinute: 1000000,
+  },
+  {
+    model: googleAI.model("gemini-2.5-flash-lite"),
+    name: "gemini-2.5-flash-lite",
+    config: { thinkingConfig: { thinkingBudget: 0 } },
+    requestsPerMinute: 4000,
+    tokensPerMinute: 1200000,
+  },
+  {
+    model: claude4Sonnet,
+    name: "claude-4-sonnet",
+    config: {},
+    requestsPerMinute: 50,
+    tokensPerMinute: 30000,
+  },
+  {
+    model: claude35Haiku,
+    name: "claude-35-haiku",
+    config: {},
+    requestsPerMinute: 50,
+    tokensPerMinute: 50000,
+  },
+];
--- a/vendor/a2ui/specification/0.9/eval/src/prompts.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/prompts.ts
@@ -0,0 +1,373 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+/** A named prompt: one entry of the `prompts` list. */
+export interface TestPrompt {
+  name: string; // Short camelCase identifier, e.g. "loginForm"
+  description: string; // One-line summary of the UI or message the prompt asks for
+  promptText: string; // The full instruction text
+}
+/** Test prompts, ranging from single protocol messages (deleteSurface, updateDataModel) to full multi-component UIs. */
+export const prompts: TestPrompt[] = [
+  {
+    name: "deleteSurface", // the only prompt that asks for a deleteSurface message
+    description: "A DeleteSurface message to remove a UI surface.",
+    promptText: `Generate a JSON message containing a deleteSurface for the surface 'dashboard-surface-1'.`,
+  },
+  {
+    name: "dogBreedGenerator",
+    description:
+      "A prompt to generate a UI for a dog breed information and generator tool.",
+    promptText: `Use a surfaceId of 'main'. Then, generate a 'createSurface' message followed by 'updateComponents' message to describe the following UI:
+
+A vertical list with:
+- Dog breed information
+- Dog generator
+
+The dog breed information is a card, which contains a title “Famous Dog breeds”, a header image, and a horizontal list of images of different dog breeds. The list information should be in the data model at /breeds.
+
+The dog generator is another card which is a form that generates a fictional dog breed with a description
+- Title
+- Description text explaining what it is
+- Dog breed name (text input)
+- Number of legs (number input)
+- Button called “Generate” which takes the data above and generates a new dog description
+- Skills (ChoicePicker component, usageHint 'multipleSelection')
+- A divider
+- A section which shows the generated content
+`,
+  },
+  {
+    name: "loginForm",
+    description:
+      'A simple login form with username, password, a "remember me" checkbox, and a submit button.',
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a login form. It should have a "Login" text (usageHint 'h1'), two text fields for username and password (bound to /login/username and /login/password), a checkbox for "Remember Me" (bound to /login/rememberMe), and a "Sign In" button. The button should trigger a 'login' action, passing the username, password, and rememberMe status in the dynamicContext.`,
+  },
+  {
+    name: "productGallery",
+    description: "A gallery of products using a list with a template.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a product gallery. It should display a list of products from the data model at '/products'. Use a template for the list items. Each item should be a Card containing a Column. The Column should contain an Image (from '/products/item/imageUrl'), a Text component for the product name (from '/products/item/name'), and a Button labeled "Add to Cart". The button's action should be 'addToCart' and include a context with the product ID, for example, 'productId': 'static-id-123' (use this exact literal string). You should create a template component and then a list that uses it.`,
+  },
+  {
+    name: "productGalleryData",
+    description:
+      "An updateDataModel message to populate the product gallery data.",
+    promptText: `Generate a 'createSurface' message with surfaceId 'main', followed by an updateDataModel message to populate the data model for the product gallery. The update should target the path '/products' and include at least two products. Each product in the map should have keys 'id', 'name', and 'imageUrl'. For example:
+    {
+      "product1": {
+        "id": "product1",
+        "name": "Awesome Gadget",
+        "imageUrl": "https://example.com/gadget.jpg"
+      }
+    }`,
+  },
+  {
+    name: "settingsPage",
+    description: "A settings page with tabs and a modal dialog.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a user settings page. Use a Tabs component with two tabs: "Profile" and "Notifications". The "Profile" tab should contain a simple column with a text field for the user's name. The "Notifications" tab should contain a checkbox for "Enable email notifications". Also, include a Modal component. The modal's entry point should be a button labeled "Delete Account", and its content should be a column with a confirmation text and two buttons: "Confirm Deletion" and "Cancel".`,
+  },
+  {
+    name: "updateDataModel",
+    description: "An updateDataModel message to update user data.",
+    promptText: `Generate a 'createSurface' message with surfaceId 'main', followed by an updateDataModel message. This is used to update the client's data model. The scenario is that a user has just logged in, and we need to populate their profile information. Create a single data model update message to set '/user/name' to "John Doe" and '/user/email' to "john.doe@example.com".`,
+  },
+  {
+    name: "animalKingdomExplorer",
+    description: "A simple, explicit UI to display a hierarchy of animals.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a simplified UI explorer for the Animal Kingdom.
+
+The UI must have a main 'Text' (usageHint 'h1') with the text "Simple Animal Explorer".
+
+Below the text heading, create a 'Tabs' component with exactly three tabs: "Mammals", "Birds", and "Reptiles".
+
+Each tab's content should be a 'Column'. The first item in each column must be a 'TextField' with the label "Search...". Below the search field, display the hierarchy for that tab using nested 'Card' components.
+
+The exact hierarchy to create is as follows:
+
+**1. "Mammals" Tab:**
+   - A 'Card' for the Class "Mammalia".
+   - Inside the "Mammalia" card, create two 'Card's for the following Orders:
+     - A 'Card' for the Order "Carnivora". Inside this, create 'Card's for these three species: "Lion", "Tiger", "Wolf".
+     - A 'Card' for the Order "Artiodactyla". Inside this, create 'Card's for these two species: "Giraffe", "Hippopotamus".
+
+**2. "Birds" Tab:**
+   - A 'Card' for the Class "Aves".
+   - Inside the "Aves" card, create three 'Card's for the following Orders:
+     - A 'Card' for the Order "Accipitriformes". Inside this, create a 'Card' for the species: "Bald Eagle".
+     - A 'Card' for the Order "Struthioniformes". Inside this, create a 'Card' for the species: "Ostrich".
+     - A 'Card' for the Order "Sphenisciformes". Inside this, create a 'Card' for the species: "Penguin".
+
+**3. "Reptiles" Tab:**
+   - A 'Card' for the Class "Reptilia".
+   - Inside the "Reptilia" card, create two 'Card's for the following Orders:
+     - A 'Card' for the Order "Crocodilia". Inside this, create a 'Card' for the species: "Nile Crocodile".
+     - A 'Card' for the Order "Squamata". Inside this, create 'Card's for these two species: "Komodo Dragon", "Ball Python".
+
+Each species card must contain a 'Row' with an 'Image' and a 'Text' component for the species name. Do not add any other components.
+
+Each Class and Order card must contain a 'Column' with a 'Text' component with the name, and then the children cards below.
+
+IMPORTANT: Do not skip any of the classes, orders, or species above. Include every item that is mentioned.
+`,
+  },
+  {
+    name: "recipeCard",
+    description: "A UI to display a recipe with ingredients and instructions.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a recipe card. It should have a 'Text' (usageHint 'h1') for the recipe title, "Classic Lasagna". Below the title, an 'Image' of the lasagna. Then, a 'Row' containing two 'Column's. The first column has a 'Text' (usageHint 'h2') "Ingredients" and a 'List' of ingredients (use 'Text' components for items: "Pasta", "Cheese", "Sauce"). The second column has a 'Text' (usageHint 'h2') "Instructions" and a 'List' of step-by-step instructions (use 'Text' components: "Boil pasta", "Layer ingredients", "Bake"). Finally, a 'Button' at the bottom labeled "Watch Video Tutorial".`,
+  },
+  {
+    name: "musicPlayer",
+    description: "A simple music player UI.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a music player. It should be a 'Card' containing a 'Column'. Inside the column, there's an 'Image' for the album art, a 'Text' for the song title "Bohemian Rhapsody", another 'Text' for the artist "Queen", a 'Slider' labeled "Progress", and a 'Row' with three 'Button' components. Each Button should have a child 'Text' component. The Text components should have the labels "Previous", "Play", and "Next" respectively.`,
+  },
+  {
+    name: "weatherForecast",
+    description: "A UI to display the weather forecast.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a weather forecast UI. It should have a 'Text' (usageHint 'h1') with the city name, "New York". Below it, a 'Row' with the current temperature as a 'Text' component ("68°F") and an 'Image' for the weather icon (e.g., a sun). Below that, a 'Divider'. Then, a 'List' component to display the 5-day forecast. Each item in the list should be a 'Row' with the day, an icon, and high/low temperatures.`,
+  },
+  {
+    name: "surveyForm",
+    description: "A customer feedback survey form.",
+    promptText: `Create a customer feedback survey form. It should have a 'Text' (usageHint 'h1') "Customer Feedback". Then a 'ChoicePicker' (usageHint 'mutuallyExclusive') with label "How would you rate our service?" and options "Excellent", "Good", "Average", "Poor". Then a 'ChoicePicker' (usageHint 'multipleSelection') with label "What did you like?" and options "Product Quality", "Price", "Customer Support". Finally, a 'TextField' with the label "Any other comments?" and a 'Button' labeled "Submit Feedback".`,
+  },
+  {
+    name: "flightBooker",
+    description: "A form to search for flights.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a flight booking form. It should have a 'Text' (usageHint 'h1') "Book a Flight". Then a 'Row' with two 'TextField's for "Origin" and "Destination". Below that, a 'Row' with two 'DateTimeInput's for "Departure Date" and "Return Date" (initialize with empty values). Add a 'Slider' labeled "Passengers" (min 1, max 10, value 1). Finally, a 'Button' labeled "Search Flights".`,
+  },
+  {
+    name: "dashboard",
+    description: "A simple dashboard with statistics.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a simple dashboard. It should have a 'Text' (usageHint 'h1') "Sales Dashboard". Below, a 'Row' containing three 'Card's. The first card has a 'Text' "Revenue" and another 'Text' "$50,000". The second card has "New Customers" and "1,200". The third card has "Conversion Rate" and "4.5%".`,
+  },
+  {
+    name: "contactCard",
+    description: "A UI to display contact information.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a contact card. It should be a 'Card' with a 'Row'. The row contains an 'Image' (as an avatar) and a 'Column'. The column contains a 'Text' for the name "Jane Doe", a 'Text' for the email "jane.doe@example.com", and a 'Text' for the phone number "(123) 456-7890". Below the main row, add a 'Button' labeled "View on Map".`,
+  },
+  {
+    name: "calendarEventCreator",
+    description: "A form to create a new calendar event.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a calendar event creation form. It should have a 'Text' (usageHint 'h1') "New Event". Include a 'TextField' for the "Event Title". Use a 'Row' for two 'DateTimeInput's for "Start Time" and "End Time" (initialize both with empty values). Add a 'CheckBox' labeled "All-day event". Finally, a 'Row' with two 'Button's: "Save" and "Cancel".`,
+  },
+  {
+    name: "checkoutPage",
+    description: "A simplified e-commerce checkout page.",
+    promptText: `Create a simplified e-commerce checkout page. It should have a 'Text' (usageHint 'h1') "Checkout". A 'Column' for shipping info with 'TextField's for "Name", "Address", "City", "Zip Code". A 'Column' for payment info with 'TextField's for "Card Number", "Expiry Date", "CVV". Finally, a 'Text' "Total: $99.99" and a 'Button' "Place Order".`,
+  },
+  {
+    name: "socialMediaPost",
+    description: "A component representing a social media post.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a social media post. It should be a 'Card' containing a 'Column'. The first item is a 'Row' with an 'Image' (user avatar) and a 'Text' (username "user123"). Below that, a 'Text' component for the post content: "Enjoying the beautiful weather today!". Then, an 'Image' for the main post picture. Finally, a 'Row' with three 'Button's: "Like", "Comment", and "Share".`,
+  },
+  {
+    name: "eCommerceProductPage",
+    description: "A detailed product page for an e-commerce website.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a product details page.
+The main layout should be a 'Row'.
+The left side of the row is a 'Column' containing a large main 'Image' of the product, and below it, a 'Row' of three smaller thumbnail 'Image' components.
+The right side of the row is another 'Column' for product information:
+- A 'Text' (usageHint 'h1') for the product name, "Premium Leather Jacket".
+- A 'Text' component for the price, "$299.99".
+- A 'Divider'.
+- A 'ChoicePicker' (usageHint 'mutuallyExclusive') labeled "Select Size" with options "S", "M", "L", "XL".
+- A 'ChoicePicker' (usageHint 'mutuallyExclusive') labeled "Select Color" with options "Black", "Brown", "Red".
+- A 'Button' with a 'Text' child "Add to Cart".
+- A 'Text' component for the product description below the button.`,
+  },
+  {
+    name: "interactiveDashboard",
+    description: "A dashboard with filters and data cards.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for an interactive analytics dashboard.
+  At the top, a 'Text' (usageHint 'h1') "Company Dashboard".
+  Below the text heading, a 'Card' containing a 'Row' of filter controls:
+  - A 'DateTimeInput' with a label for "Start Date" (initialize with empty value).
+  - A 'DateTimeInput' with a label for "End Date" (initialize with empty value).
+  - A 'Button' labeled "Apply Filters".
+  Below the filters card, a 'Row' containing two 'Card's for key metrics:
+  - The first 'Card' has a 'Text' (usageHint 'h2') "Total Revenue" and a 'Text' component showing "$1,234,567".
+  - The second 'Card' has a 'Text' (usageHint 'h2') "New Users" and a 'Text' component showing "4,321".
+  Finally, a large 'Card' at the bottom with a 'Text' (usageHint 'h2') "Revenue Over Time" and a placeholder 'Image' with a valid URL to represent a line chart.`,
+  },
+  {
+    name: "travelItinerary",
+    description: "A multi-day travel itinerary display.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a travel itinerary for a trip to Paris.
+It should have a main 'Text' component with usageHint 'h1' and text "Paris Adventure".
+Below, use a 'List' to display three days. Each item in the list should be a 'Card'.
+- The first 'Card' (Day 1) should contain a 'Text' (usageHint 'h2') "Day 1: Arrival & Eiffel Tower", and a 'List' of activities for that day: "Check into hotel", "Lunch at a cafe", "Visit the Eiffel Tower".
+- The second 'Card' (Day 2) should contain a 'Text' (usageHint 'h2') "Day 2: Museums & Culture", and a 'List' of activities: "Visit the Louvre Museum", "Walk through Tuileries Garden", "See the Arc de Triomphe".
+- The third 'Card' (Day 3) should contain a 'Text' (usageHint 'h2') "Day 3: Art & Departure", and a 'List' of activities: "Visit Musée d'Orsay", "Explore Montmartre", "Depart from CDG".
+Each activity in the inner lists should be a 'Row' containing a 'CheckBox' (to mark as complete) and a 'Text' component with the activity description.`,
+  },
+  {
+    name: "kanbanBoard",
+    description: "A Kanban-style task tracking board.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a Kanban board. It should have a 'Text' (usageHint 'h1') "Project Tasks". Below, a 'Row' containing three 'Column's representing "To Do", "In Progress", and "Done". Each column should have a 'Text' (usageHint 'h2') header and a list of 'Card's.
+    - "To Do" column: Card "Research", Card "Design".
+    - "In Progress" column: Card "Implementation".
+    - "Done" column: Card "Planning".
+    Each card should just contain a 'Text' with the task name.`,
+  },
+  {
+    name: "videoCallInterface",
+    description: "A video conference UI.",
+    promptText: `Create a video call interface. It should have a 'Text' (usageHint 'h1') "Video Call". A 'Video' component (placeholder URL). Below that, a 'Row' with three 'Button's labeled "Mute", "Camera", and "End Call".`,
+  },
+  {
+    name: "fileBrowser",
+    description: "A file explorer list.",
+    promptText: `Create a file browser. It should have a 'Text' (usageHint 'h1') "My Files". A 'List' of 'Row's. Each row has an 'Icon' (folder or attachFile) and a 'Text' (filename). Examples (create these as static rows, not data bound): "Documents", "Images", "Work.txt".`,
+  },
+  {
+    name: "chatRoom",
+    description: "A chat application interface.",
+    promptText: `Create a chat room interface. It should have a 'Column' for the message history. Inside, include several 'Card's representing messages, each with a 'Text' for the sender and a 'Text' for the message body. Specifically include these messages: "Alice: Hi there!", "Bob: Hello!". At the bottom, a 'Row' with a 'TextField' (label "Type a message...") and a 'Button' labeled "Send".`,
+  },
+  {
+    name: "fitnessTracker",
+    description: "A daily activity summary.",
+    promptText: `Create a fitness tracker dashboard. It should have a 'Text' (usageHint 'h1') "Daily Activity", and a 'Row' of 'Card's. Each card should contain a 'Column' with a 'Text' label (e.g. "Steps") and a 'Text' value (e.g. "10,000"). Create cards for "Steps" ("10,000"), "Calories" ("500 kcal"), "Distance" ("5 km"). Below that, a 'Slider' labeled "Daily Goal" (initialize value to 50). Finally, a 'List' of recent workouts. Use 'Text' components for the list items, for example: "Morning Run", "Evening Yoga", "Gym Session".`,
+  },
+  {
+    name: "smartHome",
+    description: "A smart home control panel.",
+    promptText: `Create a smart home dashboard. It should have a 'Text' (usageHint 'h1') "Living Room". A 'Grid' of 'Card's. To create the grid, use a 'Column' that contains multiple 'Row's. Each 'Row' should contain 'Card's. Create a row with cards for "Lights" (CheckBox, label "Lights", value true) and "Thermostat" (Slider, label "Thermostat", value 72). Create another row with a card for "Music" (CheckBox, label "Music", value false). Ensure the CheckBox labels are exactly "Lights" and "Music".`,
+  },
+  {
+    name: "restaurantMenu",
+    description: "A restaurant menu with tabs.",
+    promptText: `Create a restaurant menu with tabs. It should have a 'Text' (usageHint 'h1') "Gourmet Bistro". A 'Tabs' component with "Starters", "Mains", "Desserts".
+    - "Starters": 'List' containing IDs of separate 'Row' components (Name, Price). Create rows for "Soup - $8", "Salad - $10".
+    - "Mains": 'List' containing IDs of separate 'Row' components. Create rows for "Steak - $25", "Pasta - $18".
+    - "Desserts": 'List' containing IDs of separate 'Row' components. Create rows for "Cake - $8", "Pie - $7".`,
+  },
+  {
+    name: "newsAggregator",
+    description: "A news feed with article cards.",
+    promptText: `Create a news aggregator. The root component should be a 'Column'. Inside this column, place a 'Text' (usageHint 'h1') "Top Headlines". Below the text, place a 'List' of 'Card's. The 'List' should be a sibling of the 'Text', not a parent. Each card has a 'Column' with an 'Image', a 'Text' (headline), and a 'Text' (summary). Include headlines "Tech Breakthrough" and "Local Sports". Each card should have a 'Button' labeled "Read More". Create these as static components, not data bound.`,
+  },
+  {
+    name: "photoEditor",
+    description: "A photo editing interface with sliders.",
+    promptText: `Create a photo editor. It should have a large 'Image' (photo). Below it, a 'Row' of 'Button's (Filters, Crop, Adjust). Below that, a 'Slider' labeled "Intensity" (initialize value to 50).`,
+  },
+  {
+    name: "triviaQuiz",
+    description: "A trivia question card.",
+    promptText: `Create a trivia quiz. It should have a 'Text' (usageHint 'h1') "Question 1". A 'Text' "What is the capital of France?". A 'ChoicePicker' (usageHint 'mutuallyExclusive') for answers (options: "Paris", "London", "Berlin", "Madrid"). A 'Button' "Submit Answer".`,
+  },
+  {
+    name: "simpleCalculator",
+    description: "A basic calculator layout.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a calculator. It should have a 'Card'. Inside the card, there MUST be a single 'Column' that contains two things: a 'Text' (display) showing "0", and a nested 'Column' of 'Row's for the buttons.
+    - Row 1: "7", "8", "9", "/"
+    - Row 2: "4", "5", "6", "*"
+    - Row 3: "1", "2", "3", "-"
+    - Row 4: "0", ".", "=", "+"
+    Each button should be a 'Button' component.`,
+  },
+  {
+    name: "jobApplication",
+    description: "A job application form.",
+    promptText: `Create a job application form. It should have 'TextField's for "Name", "Email", "Phone", "Resume URL". A 'ChoicePicker' (usageHint 'mutuallyExclusive') labeled "Years of Experience" (options: "0-1", "2-5", "5+"). A 'Button' "Submit Application".`,
+  },
+  {
+    name: "courseSyllabus",
+    description: "A course syllabus outline.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a course syllabus. 'Text' (h1) "Introduction to Computer Science". 'List' of modules.
+- For module 1, a 'Card' with 'Text' "Algorithms" and 'List' ("Sorting", "Searching").
+- For module 2, a 'Card' with 'Text' "Data Structures" and 'List' ("Arrays", "Linked Lists").`,
+  },
+  {
+    name: "stockWatchlist",
+    description: "A stock market watchlist.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a stock watchlist. 'Text' (h1) "Market Watch". 'List' of 'Row's.
+    - Row 1: 'Text' "AAPL", 'Text' "$150.00", 'Text' "+1.2%".
+    - Row 2: 'Text' "GOOGL", 'Text' "$2800.00", 'Text' "-0.5%".
+    - Row 3: 'Text' "AMZN", 'Text' "$3400.00", 'Text' "+0.8%".`,
+  },
+  {
+    name: "podcastEpisode",
+    description: "A podcast player interface.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a podcast player. 'Card' containing:
+    - 'Image' (Cover Art).
+    - 'Text' (h2) "Episode 42: The Future of AI".
+    - 'Text' "Host: Jane Smith".
+    - 'Slider' labeled "Progress" (initialize value to 0).
+    - 'Row' with 'Button' (child 'Text' "1x"), 'Button' (child 'Text' "Play/Pause"), 'Button' (child 'Text' "Share").
+    Create these as static components, not data bound.`,
+  },
+  {
+    name: "hotelSearchResults",
+    description: "Hotel search results list.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for hotel search results. 'Text' (h1) "Hotels in Tokyo". 'List' of 'Card's.
+    - Card 1: 'Row' with 'Image', 'Column' ('Text' "Grand Hotel", 'Text' "5 Stars", 'Text' "$200/night"), 'Button' "Book".
+    - Card 2: 'Row' with 'Image', 'Column' ('Text' "City Inn", 'Text' "3 Stars", 'Text' "$100/night"), 'Button' "Book".`,
+  },
+  {
+    name: "notificationCenter",
+    description: "A list of notifications.",
+    promptText: `Create a notification center. It should have a 'Text' (usageHint 'h1') "Notifications". A 'List' of 'Card's. Include cards for "New message from Sarah" and "Your order has shipped". Each card should have a 'Button' "Dismiss".`,
+  },
+  {
+    name: "nestedDataBinding", // asks for three messages: createSurface, updateComponents, updateDataModel
+    description: "A project dashboard with deeply nested data binding.",
+    promptText: `Generate a stream of JSON messages for a Project Management Dashboard.
+    The output must consist of exactly three JSON objects, one after the other.
+
+    Generate a createSurface message with surfaceId 'main'.
+    Generate an updateComponents message with surfaceId 'main'.
+    It should have a 'Text' (usageHint 'h1') "Project Dashboard".
+    Then a 'List' of projects bound to '/projects'.
+    Inside the list template, each item should be a 'Card' containing:
+    - A 'Text' (usageHint 'h2') bound to the project 'title'.
+    - A 'List' of tasks bound to the 'tasks' property of the project.
+    Inside the tasks list template, each item should be a 'Column' containing:
+    - A 'Text' bound to the task 'description'.
+    - A 'Row' for the assignee, containing:
+      - A 'Text' bound to 'assignee/name'.
+      - A 'Text' bound to 'assignee/role'.
+    - A 'List' of subtasks bound to 'subtasks'.
+    Inside the subtasks list template, each item should be a 'Text' bound to 'title'.
+
+    Then generate an 'updateDataModel' message.
+    Populate this dashboard with sample data:
+    - At least one project.
+    - The project should have a title, and a list of tasks.
+    - The task should have a description, an assignee object (with name and role), and a list of subtasks.`,
+  },
+
+  {
+    name: "profileEditor",
+    description: "A user profile editing form.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for editing a profile. 'Text' (h1) "Edit Profile". 'Image' (Current Avatar). 'Button' "Change Photo". 'TextField' "Display Name". 'TextField' "Bio" (multiline). 'TextField' "Website". 'Button' "Save Changes".`,
+  },
+  {
+    name: "cinemaSeatSelection",
+    description: "A seat selection grid.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for cinema seat selection. 'Text' (h1) "Select Seats". 'Text' "Screen" (centered). 'Column' of 'Row's representing rows of seats.
+    - Row A: 4 'CheckBox'es.
+    - Row B: 4 'CheckBox'es.
+    - Row C: 4 'CheckBox'es.
+    'Button' "Confirm Selection".`,
+  },
+  {
+    name: "flashcardApp",
+    description: "A language learning flashcard.",
+    promptText: `Generate a 'createSurface' message and a 'updateComponents' message with surfaceId 'main' for a flashcard app. 'Text' (h1) "Spanish Vocabulary". 'Card' (the flashcard). Inside the card, a 'Column' with 'Text' (h2) "Hola" (Front). 'Divider'. 'Text' "Hello" (Back - conceptually hidden, but rendered here). 'Row' of buttons: "Hard", "Good", "Easy".`,
+  },
+];
--- a/vendor/a2ui/specification/0.9/eval/src/rateLimiter.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/rateLimiter.ts
@@ -0,0 +1,205 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { logger } from "./logger";
+import { ModelConfiguration } from "./models";
+/** One entry in a model's usage history. */
+interface UsageRecord {
+  timestamp: number; // Date.now() value (epoch ms) taken when the entry was recorded
+  tokensUsed: number; // Tokens this entry counts against the per-minute token limit
+  isRequest: boolean; // True if the entry also counts as one request against the RPM limit
+}
+/** Usage history that RateLimiter keeps for one model name. */
+interface ModelRateLimitState {
+  usageRecords: UsageRecord[]; // Appended in call order; entries older than about 65 seconds are pruned
+}
+// Sliding window: 65s rather than 60s, to absorb clock drift and server bucket alignment.
+const WINDOW_MS = 65 * 1000;
+
+/** Sliding-window rate limiter that tracks request and token usage per model name. */
+export class RateLimiter {
+  private modelStates: Map<string, ModelRateLimitState> = new Map();
+  private _waitingCount = 0;
+  // Model name -> epoch-ms time until which that model is paused after a 429.
+  private modelPauses: Map<string, number> = new Map();
+
+  /** Number of acquirePermit() calls that have started but not yet returned. */
+  get waitingCount(): number {
+    return this._waitingCount;
+  }
+
+  /** Returns the usage state for modelName, creating an empty one on first use. */
+  private getModelState(modelName: string): ModelRateLimitState {
+    const state = this.modelStates.get(modelName) ?? { usageRecords: [] };
+    this.modelStates.set(modelName, state);
+    return state;
+  }
+
+  /** Drops the records that have left the sliding window as of `now`. */
+  private cleanUpRecords(state: ModelRateLimitState, now: number): void {
+    const cutoff = now - WINDOW_MS;
+    state.usageRecords = state.usageRecords.filter(
+      (record) => record.timestamp > cutoff
+    );
+  }
+
+  /**
+   * Pauses the model after a rate-limit error (status RESOURCE_EXHAUSTED, code 429 or
+   * "429" in the message) for the "retry in <N>s" hint, or 60s, plus a 1s buffer.
+   */
+  reportError(modelConfig: ModelConfiguration, error: any): void {
+    // Only string fields are inspected; anything else is treated as absent.
+    const text = (v: unknown): string => (typeof v === "string" ? v : "");
+    const isResourceExhausted =
+      error?.status === "RESOURCE_EXHAUSTED" ||
+      error?.code === 429 ||
+      text(error?.message).includes("429");
+    if (!isResourceExhausted) return; // other errors are ignored
+
+    // Example: "Please retry in 22.648565753s."
+    const message = text(error?.originalMessage) || text(error?.message);
+    const match = message.match(/retry in ([0-9.]+)\s*s/i);
+    const hinted = parseFloat(match?.[1] ?? "");
+    // The capture can be non-numeric (e.g. "."); use the 60s default on NaN.
+    const retrySeconds = Number.isFinite(hinted) ? hinted : 60;
+
+    const pauseDuration = Math.ceil(retrySeconds * 1000) + 1000; // small buffer
+    const pausedUntil = Date.now() + pauseDuration;
+    this.modelPauses.set(modelConfig.name, pausedUntil);
+
+    logger.verbose(
+      `RateLimiter: Pausing ${modelConfig.name} for ${pauseDuration}ms due to 429 error. Resuming at ${new Date(pausedUntil).toISOString()}`
+    );
+  }
+
+  /**
+   * Waits until one request costing `tokensCost` tokens fits within the model's
+   * per-minute limits (tokens are capped at 90% of tokensPerMinute), then reserves it.
+   * Returns immediately when the model has neither limit configured.
+   */
+  async acquirePermit(
+    modelConfig: ModelConfiguration,
+    tokensCost: number = 0
+  ): Promise<void> {
+    this._waitingCount++;
+    try {
+      const { name, requestsPerMinute, tokensPerMinute } = modelConfig;
+      if (!requestsPerMinute && !tokensPerMinute) {
+        return; // No limits
+      }
+
+      const state = this.getModelState(name);
+      // Apply a 10% safety buffer to the token limit
+      const effectiveTokensPerMinute = tokensPerMinute
+        ? Math.floor(tokensPerMinute * 0.9)
+        : 0;
+
+      // Loop to re-check after waiting, as multiple limits might be in play
+      while (true) {
+        // Check if model is paused globally due to 429
+        const pausedUntil = this.modelPauses.get(name);
+        if (pausedUntil && pausedUntil > Date.now()) {
+          const pauseWait = pausedUntil - Date.now();
+          logger.verbose(
+            `Rate limiting ${name}: Paused by circuit breaker for ${pauseWait}ms`
+          );
+          await new Promise((resolve) => setTimeout(resolve, pauseWait));
+          continue; // Re-check the pause, then the normal rate limits
+        }
+
+        // One timestamp for pruning and wait maths, so kept records expire in the future
+        const currentNow = Date.now();
+        this.cleanUpRecords(state, currentNow);
+        let rpmWait = 0;
+        let tpmWait = 0;
+        let currentTokens = 0;
+        let currentRequests = 0;
+        for (const r of state.usageRecords) {
+          currentTokens += r.tokensUsed;
+          if (r.isRequest) currentRequests++;
+        }
+
+        logger.debug(
+          `RateLimiter check for ${name}: Cost=${tokensCost}, CurrentTokens=${currentTokens}, Limit=${effectiveTokensPerMinute}, Requests=${currentRequests}, RPM=${requestsPerMinute}`
+        );
+
+        // Check RPM
+        if (requestsPerMinute && currentRequests + 1 > requestsPerMinute) {
+          // Records are appended with the current time, so the first request is the oldest
+          const oldestRequest = state.usageRecords.find((r) => r.isRequest);
+          if (oldestRequest) {
+            rpmWait = oldestRequest.timestamp + WINDOW_MS - currentNow;
+          }
+        }
+
+        // Check TPM: shed enough tokens so that (current - shed + cost) <= limit
+        const tokensToShed =
+          currentTokens + tokensCost - effectiveTokensPerMinute;
+        if (tokensPerMinute && tokensToShed > 0) {
+          // Wait for the record whose expiry completes the shed. If tokensCost alone
+          // exceeds the limit no record qualifies, tpmWait stays 0 and the call proceeds.
+          let cumulativeTokens = 0;
+          for (const record of state.usageRecords) {
+            cumulativeTokens += record.tokensUsed;
+            if (cumulativeTokens >= tokensToShed) {
+              tpmWait = record.timestamp + WINDOW_MS - currentNow;
+              break;
+            }
+          }
+        }
+
+        const requiredWait = Math.max(rpmWait, tpmWait);
+        if (requiredWait <= 0) {
+          // Reserve before any await so that concurrent callers see this usage
+          state.usageRecords.push({
+            timestamp: Date.now(),
+            tokensUsed: tokensCost,
+            isRequest: true,
+          });
+          break; // Permit acquired
+        }
+
+        logger.verbose(
+          `Rate limiting ${name}: Waiting ${requiredWait}ms (RPM wait: ${rpmWait}ms, TPM wait: ${tpmWait}ms)`
+        );
+        await new Promise((resolve) => setTimeout(resolve, requiredWait));
+      }
+    } finally {
+      this._waitingCount--;
+    }
+  }
+
+  /**
+   * Appends a usage record stamped with the current time; does nothing when
+   * tokensUsed is not positive and isRequest is false.
+   * NOTE(review): acquirePermit() already records one request, so callers adding
+   * tokens for that same call presumably pass isRequest=false — confirm.
+   */
+  recordUsage(
+    modelConfig: ModelConfiguration,
+    tokensUsed: number,
+    isRequest: boolean = true
+  ): void {
+    if (!(tokensUsed > 0 || isRequest)) return;
+    this.getModelState(modelConfig.name).usageRecords.push({
+      timestamp: Date.now(),
+      tokensUsed,
+      isRequest,
+    });
+  }
+}
+/** Module-level RateLimiter instance exported for shared use. */
+export const rateLimiter = new RateLimiter();
--- a/vendor/a2ui/specification/0.9/eval/src/types.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/types.ts
@@ -0,0 +1,47 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import { TestPrompt } from "./prompts";
+
+/**
+ * Result of one generation run for a single model/prompt combination.
+ */
+export interface GeneratedResult {
+  /** Name of the model that produced this result. */
+  modelName: string;
+  /** Test prompt this result belongs to. */
+  prompt: TestPrompt;
+  /** Run index for this model/prompt pair. NOTE(review): numbering base is not visible here. */
+  runNumber: number;
+  /** Raw text output, when available. Presumably the unparsed model response; confirm against the generator. */
+  rawText?: string;
+  /** Parsed output items, when available. NOTE(review): element shape is untyped (`any`). */
+  components?: any[];
+  /** Latency of the generation. NOTE(review): unit is not visible here; confirm against the producer. */
+  latency: number;
+  /** Error value captured for this run, if any. */
+  error?: any;
+}
+
+/**
+ * A {@link GeneratedResult} extended with the outcome of validation.
+ */
+export interface ValidatedResult extends GeneratedResult {
+  /** Validation error messages as plain strings. */
+  validationErrors: string[];
+}
+
+/**
+ * Severity label attached to an evaluation issue.
+ *
+ * NOTE(review): the listing order looks like increasing severity, but no
+ * ordering is defined by the type itself — confirm before comparing values.
+ */
+export type IssueSeverity =
+  | "minor"
+  | "significant"
+  | "critical"
+  | "criticalSchema";
+
+/**
+ * A {@link ValidatedResult} optionally extended with an evaluation verdict.
+ */
+export interface EvaluatedResult extends ValidatedResult {
+  /** Evaluation verdict; may be absent. */
+  evaluationResult?: {
+    /** Whether the result passed evaluation. */
+    pass: boolean;
+    /** Explanation accompanying the verdict. */
+    reason: string;
+    /** Individual issues, each with its own severity. */
+    issues?: { issue: string; severity: IssueSeverity }[];
+    /** Severity assigned to the result as a whole. */
+    overallSeverity?: IssueSeverity;
+    /** Presumably the prompt sent to the evaluator — TODO confirm against the producer. */
+    evalPrompt?: string;
+  };
+}
--- a/vendor/a2ui/specification/0.9/eval/src/utils.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/utils.ts
@@ -0,0 +1,44 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+/**
+ * Collects every JSON value found inside ```json fenced blocks of a Markdown
+ * string.
+ *
+ * Each fenced block is first parsed as one JSON document. If that fails, the
+ * block is treated as JSONL: every non-blank line is parsed on its own and
+ * lines that are not valid JSON are skipped.
+ *
+ * @param markdown Text that may contain ```json fenced blocks.
+ * @returns The parsed values in document order; empty when nothing parses.
+ */
+export function extractJsonFromMarkdown(markdown: string): any[] {
+  const collected: any[] = [];
+
+  // Appends the parsed value and reports success; invalid JSON reports false.
+  const tryCollect = (text: string): boolean => {
+    try {
+      collected.push(JSON.parse(text));
+      return true;
+    } catch (parseError) {
+      return false;
+    }
+  };
+
+  const fencePattern = /```json\s*([\s\S]*?)\s*```/g;
+  for (const [, captured] of markdown.matchAll(fencePattern)) {
+    if (!captured) {
+      continue;
+    }
+
+    const body = captured.trim();
+    if (tryCollect(body)) {
+      continue;
+    }
+
+    // Not a single JSON document: fall back to one JSON value per line.
+    body
+      .split("\n")
+      .filter((line) => line.trim())
+      .forEach((line) => {
+        tryCollect(line);
+      });
+  }
+
+  return collected;
+}
--- a/vendor/a2ui/specification/0.9/eval/src/validator.ts
+++ b/vendor/a2ui/specification/0.9/eval/src/validator.ts
@@ -0,0 +1,365 @@
+/*
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+import Ajv from "ajv/dist/2020";
+import * as fs from "fs";
+import * as path from "path";
+import * as yaml from "js-yaml";
+
+import { GeneratedResult, ValidatedResult, IssueSeverity } from "./types";
+import { logger } from "./logger";
+
+/**
+ * Phase 2 of the eval pipeline: validates generated A2UI server-to-client
+ * messages.
+ *
+ * Every message is checked twice: against the JSON schemas registered with
+ * Ajv, and against structural rules implemented in this class (surface
+ * creation order, presence of a root component, duplicate component IDs and
+ * dangling child references). Findings are reported as plain strings.
+ */
+export class Validator {
+  /** Key of the server-to-client message schema holding the message `$defs`. */
+  private static readonly MESSAGE_SCHEMA_URI =
+    "https://a2ui.dev/specification/0.9/server_to_client.json";
+
+  /** Key of the catalog schema holding one `$defs` entry per component type. */
+  private static readonly CATALOG_SCHEMA_URI =
+    "https://a2ui.dev/specification/0.9/standard_catalog_definition.json";
+
+  /**
+   * Message key -> `$defs` name, in the order the keys are probed. Validating
+   * against the specific definition avoids noisy top-level "oneOf" errors.
+   */
+  private static readonly MESSAGE_DEFS: ReadonlyArray<
+    readonly [string, string]
+  > = [
+    ["createSurface", "CreateSurfaceMessage"],
+    ["updateComponents", "UpdateComponentsMessage"],
+    ["updateDataModel", "UpdateDataModelMessage"],
+    ["deleteSurface", "DeleteSurfaceMessage"],
+  ];
+
+  private ajv: Ajv;
+  private validateFn: any;
+
+  /**
+   * @param schemas Schemas to register with Ajv, keyed by the name they are
+   *   registered under.
+   * @param outputDir When set, a YAML failure report is written below this
+   *   directory for every result that fails validation.
+   */
+  constructor(
+    private schemas: Record<string, any>,
+    private outputDir?: string
+  ) {
+    // strict: false keeps Ajv lenient about unknown keywords in the schemas.
+    this.ajv = new Ajv({ allErrors: true, strict: false });
+    for (const [name, schema] of Object.entries(schemas)) {
+      this.ajv.addSchema(schema, name);
+    }
+    this.validateFn = this.ajv.getSchema(Validator.MESSAGE_SCHEMA_URI);
+  }
+
+  /**
+   * Validates every generated result and returns one validated result per
+   * input, in the same order.
+   *
+   * Results that already carry an error or have no components are passed
+   * through with an empty error list and are counted neither as passed nor as
+   * failed.
+   *
+   * @param results Generation results to validate.
+   * @returns The inputs extended with their `validationErrors`.
+   * @throws Error when a message-level schema cannot be resolved.
+   */
+  async run(results: GeneratedResult[]): Promise<ValidatedResult[]> {
+    logger.info(
+      `Starting Phase 2: Schema Validation (${results.length} items)`
+    );
+    const validatedResults: ValidatedResult[] = [];
+    let passedCount = 0;
+    let failedCount = 0;
+
+    // Validation is synchronous CPU work, so a plain loop is sufficient.
+    for (const result of results) {
+      if (result.error || !result.components) {
+        // Generation already failed; nothing to validate.
+        validatedResults.push({ ...result, validationErrors: [] });
+        continue;
+      }
+
+      const errors: string[] = [];
+      const components = result.components;
+
+      // Schema validation, one message at a time.
+      for (const message of components) {
+        errors.push(...this.validateMessageSchema(message));
+      }
+
+      // Custom validation (referential integrity, etc.).
+      this.validateCustom(components, errors);
+
+      if (errors.length > 0) {
+        failedCount++;
+        if (this.outputDir) {
+          this.saveFailure(result, errors);
+        }
+      } else {
+        passedCount++;
+      }
+
+      validatedResults.push({
+        ...result,
+        validationErrors: errors,
+      });
+    }
+
+    logger.info(
+      `Phase 2: Validation Complete. Passed: ${passedCount}, Failed: ${failedCount}`
+    );
+    return validatedResults;
+  }
+
+  /**
+   * Returns the compiled validator for a schema key or ref.
+   *
+   * @throws Error when no schema is registered under `ref`; the message
+   *   mirrors the one Ajv's own `validate(ref, data)` raises.
+   */
+  private requireValidator(ref: string): any {
+    const validate = this.ajv.getSchema(ref);
+    if (!validate) {
+      throw new Error(`no schema with key or ref "${ref}"`);
+    }
+    return validate;
+  }
+
+  /**
+   * Runs a compiled validator and formats its errors.
+   *
+   * Errors are read from the validator function itself: a compiled validator
+   * stores them on its own `errors` property, not on the Ajv instance.
+   *
+   * @param validate Compiled Ajv validator function.
+   * @param data Value to validate.
+   * @param suffix Text appended to every formatted error.
+   */
+  private collectSchemaErrors(
+    validate: any,
+    data: any,
+    suffix: string = ""
+  ): string[] {
+    if (validate(data)) {
+      return [];
+    }
+    return (validate.errors || []).map(
+      (err: any) => `${err.instancePath} ${err.message}${suffix}`
+    );
+  }
+
+  /**
+   * Validates one message against the definition selected by the first known
+   * key it carries, or against the top-level message schema when no known key
+   * is present (including non-object values such as `null`).
+   */
+  private validateMessageSchema(message: any): string[] {
+    const matched = Validator.MESSAGE_DEFS.find(([key]) => message?.[key]);
+    const validate = matched
+      ? this.requireValidator(
+          `${Validator.MESSAGE_SCHEMA_URI}#/$defs/${matched[1]}`
+        )
+      : (this.validateFn ??
+        this.requireValidator(Validator.MESSAGE_SCHEMA_URI));
+    return this.collectSchemaErrors(validate, message);
+  }
+
+  /**
+   * Writes a YAML failure report to
+   * `<outputDir>/output-<model>/details/<prompt>.<run>.failed.yaml`.
+   * Does nothing when no output directory was configured.
+   */
+  private saveFailure(result: GeneratedResult, errors: string[]) {
+    if (!this.outputDir) return;
+    const modelDir = path.join(
+      this.outputDir,
+      `output-${result.modelName.replace(/[\/:]/g, "_")}`
+    );
+    const detailsDir = path.join(modelDir, "details");
+    const severity: IssueSeverity = "criticalSchema";
+    const failureData = {
+      pass: false,
+      reason: "Schema validation failure",
+      issues: errors.map((issue) => ({ issue, severity })),
+      overallSeverity: severity,
+    };
+
+    // writeFileSync does not create missing parent directories.
+    fs.mkdirSync(detailsDir, { recursive: true });
+    fs.writeFileSync(
+      path.join(
+        detailsDir,
+        `${result.prompt.name}.${result.runNumber}.failed.yaml`
+      ),
+      yaml.dump(failureData)
+    );
+  }
+
+  /**
+   * Applies the checks that span a whole message list: an `updateComponents`
+   * message must follow the `createSurface` for its surface, every message
+   * must be of a known type, and when any `updateComponents` message exists at
+   * least one must contain a component with id `root`.
+   *
+   * @param messages Messages of one result, in output order.
+   * @param errors Accumulator that findings are appended to.
+   */
+  private validateCustom(messages: any[], errors: string[]) {
+    let hasUpdateComponents = false;
+    let hasRootComponent = false;
+    const createdSurfaces = new Set<string>();
+
+    for (const message of messages) {
+      // Optional chaining: a `null` entry falls through to "unknown message".
+      if (message?.updateComponents) {
+        hasUpdateComponents = true;
+        const update = message.updateComponents;
+        const surfaceId = update.surfaceId;
+        if (surfaceId && !createdSurfaces.has(surfaceId)) {
+          errors.push(
+            `updateComponents message received for surface '${surfaceId}' before createSurface message.`
+          );
+        }
+
+        this.validateUpdateComponents(update, errors);
+
+        // A non-array `components` has already been reported above.
+        if (
+          Array.isArray(update.components) &&
+          update.components.some((comp: any) => comp?.id === "root")
+        ) {
+          hasRootComponent = true;
+        }
+      } else if (message?.createSurface) {
+        this.validateCreateSurface(message.createSurface, errors);
+        if (message.createSurface.surfaceId) {
+          createdSurfaces.add(message.createSurface.surfaceId);
+        }
+      } else if (message?.updateDataModel) {
+        this.validateUpdateDataModel(message.updateDataModel, errors);
+      } else if (message?.deleteSurface) {
+        this.validateDeleteSurface(message.deleteSurface, errors);
+      } else {
+        errors.push(
+          `Unknown message type in output: ${JSON.stringify(message)}`
+        );
+      }
+    }
+
+    // Algorithmic check for root component
+    if (hasUpdateComponents && !hasRootComponent) {
+      errors.push(
+        "Missing root component: At least one 'updateComponents' message must contain a component with id: 'root'."
+      );
+    }
+  }
+
+  /**
+   * Requires `surfaceId` and `catalogId` on a createSurface payload and
+   * reports any other property.
+   */
+  private validateCreateSurface(data: any, errors: string[]) {
+    if (data.surfaceId === undefined) {
+      errors.push("createSurface must have a 'surfaceId' property.");
+    }
+    if (data.catalogId === undefined) {
+      errors.push("createSurface must have a 'catalogId' property.");
+    }
+    const allowed = ["surfaceId", "catalogId"];
+    for (const key in data) {
+      if (!allowed.includes(key)) {
+        errors.push(`createSurface has unexpected property: ${key}`);
+      }
+    }
+  }
+
+  /**
+   * Requires `surfaceId` on a deleteSurface payload and reports any other
+   * property.
+   */
+  private validateDeleteSurface(data: any, errors: string[]) {
+    if (data.surfaceId === undefined) {
+      errors.push("DeleteSurface must have a 'surfaceId' property.");
+    }
+    const allowed = ["surfaceId"];
+    for (const key in data) {
+      if (!allowed.includes(key)) {
+        errors.push(`DeleteSurface has unexpected property: ${key}`);
+      }
+    }
+  }
+
+  /**
+   * Validates an updateComponents payload: required fields, duplicate IDs,
+   * each component against the catalog definition for its type, and child
+   * references.
+   *
+   * A component type without a catalog definition is reported as an error
+   * rather than thrown, because the type comes from generated output.
+   */
+  private validateUpdateComponents(data: any, errors: string[]) {
+    if (data.surfaceId === undefined) {
+      errors.push("UpdateComponents must have a 'surfaceId' property.");
+    }
+    if (!data.components || !Array.isArray(data.components)) {
+      errors.push("UpdateComponents must have a 'components' array.");
+      return;
+    }
+
+    const componentIds = new Set<string>();
+    for (const c of data.components) {
+      const id = c?.id;
+      if (id) {
+        if (componentIds.has(id)) {
+          errors.push(`Duplicate component ID found: ${id}`);
+        }
+        componentIds.add(id);
+      }
+
+      // Missing or non-string types are reported by validateComponent below.
+      const componentType = c?.component;
+      if (!componentType || typeof componentType !== "string") {
+        continue;
+      }
+
+      const suffix = ` (in component '${id || "unknown"}')`;
+      const validate = this.ajv.getSchema(
+        `${Validator.CATALOG_SCHEMA_URI}#/$defs/${componentType}`
+      );
+      if (!validate) {
+        errors.push(
+          `Unknown component type '${componentType}': no schema definition found${suffix}`
+        );
+        continue;
+      }
+      errors.push(...this.collectSchemaErrors(validate, c, suffix));
+    }
+
+    for (const component of data.components) {
+      this.validateComponent(component, componentIds, errors);
+    }
+  }
+
+  /**
+   * Checks the conditional `value` rule of updateDataModel: it must be absent
+   * when `op` is "remove" and present otherwise. Types, required fields and
+   * extra properties are left to schema validation.
+   */
+  private validateUpdateDataModel(data: any, errors: string[]) {
+    if (data.op === "remove") {
+      if (data.value !== undefined) {
+        errors.push(
+          "updateDataModel 'value' property must not be present when op is 'remove'."
+        );
+      }
+    } else {
+      // Any other op (schema validation checks the enum values).
+      if (data.value === undefined) {
+        errors.push(
+          `updateDataModel 'value' property is required when op is '${data.op}'.`
+        );
+      }
+    }
+  }
+
+  /**
+   * Checks one component's `id`, `component` type and child references.
+   *
+   * @param component Component entry from an updateComponents payload.
+   * @param allIds IDs declared in the same updateComponents payload; child
+   *   references are resolved against this set only.
+   * @param errors Accumulator that findings are appended to.
+   */
+  private validateComponent(
+    component: any,
+    allIds: Set<string>,
+    errors: string[]
+  ) {
+    const id = component?.id;
+    if (!id) {
+      errors.push(`Component is missing an 'id'.`);
+      return;
+    }
+
+    const componentType = component.component;
+    if (!componentType || typeof componentType !== "string") {
+      errors.push(`Component '${id}' is missing 'component' property.`);
+      return;
+    }
+
+    // Reports every referenced ID that is not declared in `allIds`, naming
+    // both the referencing component and the missing reference.
+    const checkRefs = (refs: (string | undefined)[]) => {
+      for (const ref of refs) {
+        if (ref && !allIds.has(ref)) {
+          errors.push(
+            `Component '${id}' references non-existent component ID ${JSON.stringify(ref)}.`
+          );
+        }
+      }
+    };
+
+    switch (componentType) {
+      case "Row":
+      case "Column":
+      case "List":
+        if (component.children) {
+          if (Array.isArray(component.children)) {
+            // Explicit list of child IDs.
+            checkRefs(component.children);
+          } else if (
+            typeof component.children === "object" &&
+            component.children.componentId
+          ) {
+            // Object form carrying a single `componentId`.
+            checkRefs([component.children.componentId]);
+          }
+        }
+        break;
+      case "Card":
+      case "Button":
+        checkRefs([component.child]);
+        break;
+      case "Tabs":
+        if (Array.isArray(component.tabItems)) {
+          for (const tab of component.tabItems) {
+            checkRefs([tab?.child]);
+          }
+        }
+        break;
+      case "Modal":
+        checkRefs([component.entryPointChild, component.contentChild]);
+        break;
+    }
+  }
+}