# /run Endpoint UX Test with Error Remediation
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint, WITH automated error investigation and remediation.
#
# This extends the basic UX test with a comprehensive error handling pipeline:
# - If blocking errors are detected, investigate using logs, database, and code
# - Verify diagnosis through synthesis loop
# - Triage: quick fix vs. bigger change requiring CEO oversight
# - Quick fixes: engineer implements, deploys, tests, iterates
# - Bigger changes: build plan, parallel engineers, review, deploy, smoke test
#
# Key patterns demonstrated:
# - Mid-program `input` for user checkpoints
# - Persistent agents with `resume:` for accumulated context
# - Parallel investigation with multiple angles
# - `choice` blocks for triage decisions
# - `retry` with backoff for flaky operations
# - Recursive self-healing (if fix fails, re-test)

# Default test program (simple hello world)
# Used as the minimal workload submitted to POST /run; kept trivial so the
# observers can focus on endpoint UX rather than program complexity.
const test_program = """
# Quick Hello
session "Say hello and count to 5"
"""

# Auto-auth: Read credentials from .env.test and fetch token
# Two bootstrap sessions: the first resolves the API base URL, the second
# exchanges the test credentials for a bearer token used by every later call.
let api_url = session "Read API URL"
  prompt: """Read the TEST_API_URL from .env.test and return just the URL.
  If not found, default to: https://api-v2.prose.md"""

let auth_token = session "Authenticate"
  prompt: """Read credentials from .env.test (TEST_EMAIL, TEST_PASSWORD).
  Then POST to {api_url}/auth/login with these credentials.
  Return just the token value (no Bearer prefix)."""
  context: api_url

# ============================================================================
# Agent Definitions
# ============================================================================

# --- Observation Team ---

# Persistent agent: accumulates context across `resume:` calls while watching
# the live WebSocket execution stream from a first-time user's perspective.
agent ws_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher observing an OpenProse program execution.

  Your job is to watch the WebSocket execution stream and evaluate the experience
  from a USER's perspective - not as an engineer checking correctness.

  Focus on:
  - Latency and responsiveness (does it FEEL fast?)
  - Clarity of status transitions (does the user know what's happening?)
  - Quality of streamed events (are they informative? overwhelming? sparse?)
  - Error messages (helpful or cryptic?)
  - Overall flow (smooth or jarring?)

  Log your raw observations, then periodically synthesize into user feedback.
  Think: "If I were a first-time user, what would I think right now?"
  """

# Persistent agent: polls the environment's file tree between snapshots and
# evaluates whether on-disk state would make sense to a user browsing files.
agent file_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher monitoring the file system during execution.

  Your job is to observe how the filesystem changes as a program runs, evaluating
  whether the state management would make sense to a user browsing files.

  Focus on:
  - Directory structure clarity (can a user understand what's where?)
  - File naming conventions (self-documenting or cryptic?)
  - State file contents (readable? useful for debugging?)
  - Timing of file creation/modification (predictable?)
  - What a file browser UI should show

  You will poll periodically and note changes between snapshots.
  """

# One-shot agent: merges both observers' findings into prioritized feedback.
# Its "Error Classification" trailer (blocking_error / error_summary) is what
# the main workflow's remediation branch keys off.
agent synthesizer:
  model: opus
  prompt: """You are a senior UX researcher synthesizing observations from
  multiple sources into prioritized, actionable feedback.

  Your output should be:
  1. Correlated findings (where did both observers notice the same thing?)
  2. Prioritized action items (high/medium/low)
  3. Specific quotes/evidence supporting each finding
  4. Recommendations that are concrete and implementable

  Be direct. "The loading state is confusing" not "Consider potentially improving..."

  IMPORTANT: At the end of your synthesis, include:

  ## Error Classification
  blocking_error: true/false
  error_summary: "One-line description of the blocking error, if any"
  """

# --- Remediation Team ---

# Persistent agent: runs the parallel investigations in investigate_error and
# is later resumed to unify its own findings and to refine the diagnosis.
agent researcher:
  model: opus
  persist: true
  prompt: """You are a senior engineer investigating a production error.

  Your job is to diagnose the ROOT CAUSE of errors by:
  1. Reading relevant log files
  2. Querying the database for related records
  3. Examining the source code that produced the error
  4. Tracing the execution path

  Be thorough but focused. Follow the evidence. Don't speculate without data.

  Output a structured diagnosis:
  - Error symptom: What the user/system observed
  - Root cause: The underlying technical issue
  - Evidence: Specific logs, code, or data supporting your diagnosis
  - Confidence: High/Medium/Low
  - Affected components: Which files/services are involved
  """

# One-shot agent: adversarially reviews a diagnosis. Its diagnosis_sound flag
# drives the main workflow's verification loop (refine vs. proceed).
agent diagnosis_verifier:
  model: opus
  prompt: """You are a staff engineer verifying a diagnosis.

  Your job is to critically evaluate a proposed diagnosis by:
  1. Checking if the evidence actually supports the conclusion
  2. Looking for alternative explanations
  3. Verifying the logic chain from symptom to root cause
  4. Identifying gaps in the investigation

  Be skeptical but fair. A good diagnosis should be:
  - Supported by concrete evidence (not just plausible)
  - Specific (not vague like "something went wrong")
  - Actionable (points to what needs to be fixed)

  Output:
  - diagnosis_sound: true/false
  - critique: What's wrong or missing (if not sound)
  - follow_up_questions: What the researcher should investigate (if not sound)
  - approved_diagnosis: The verified diagnosis (if sound)
  """

# One-shot agent: classifies a verified diagnosis as "quick_fix" or
# "bigger_change"; the main workflow's `choice` block routes on this decision.
agent triage_expert:
  model: opus
  prompt: """You are a tech lead triaging a diagnosed bug.

  Evaluate the diagnosis and categorize the fix:

  QUICK FIX criteria (ALL must be true):
  - Isolated bug affecting < 3 files
  - No architectural changes required
  - No API contract changes
  - No security implications
  - Estimated effort < 1 hour
  - Low risk of regression

  BIGGER CHANGE criteria (ANY triggers this):
  - Affects > 3 files or multiple services
  - Requires architectural decisions
  - Changes API contracts or data models
  - Has security implications
  - Requires CEO/stakeholder input
  - High risk of regression
  - Unclear solution path

  Output:
  - triage_decision: "quick_fix" or "bigger_change"
  - rationale: Why this classification
  - risk_assessment: What could go wrong
  - recommended_approach: High-level fix strategy
  """

# Persistent agent: implements fixes in quick_fix_cycle (and executes phases
# in bigger_change_flow); resumed to address review feedback with prior context.
agent engineer:
  model: opus
  persist: true
  prompt: """You are a senior engineer implementing a fix.

  Your job is to:
  1. Understand the diagnosis and recommended approach
  2. Write clean, tested code that fixes the issue
  3. Follow existing patterns in the codebase
  4. Create atomic commits with clear messages
  5. Verify the fix works

  Do not over-engineer. Fix the issue directly and simply.
  Follow the project's coding standards and testing patterns.
  """

# One-shot agent: produces the phased build plan used by bigger_change_flow;
# the plan's phases are fanned out to parallel engineers via pmap.
agent build_planner:
  model: opus
  prompt: """You are a software architect creating a build plan.

  Follow the standards in docs/PLANNING_BEST_PRACTICES.md:
  - Break work into self-contained phases
  - Each phase should be testable and committable
  - Identify parallel work where possible
  - Define clear verification criteria
  - Plan for rollback

  Output a structured plan with:
  - Phases (numbered, with dependencies)
  - Tasks per phase
  - Verification steps
  - Commit strategy
  - Risk mitigation
  """

# One-shot agent: reviews implementations; its review_approved flag gates the
# quick_fix_cycle loop and the bigger_change_flow deploy step.
agent reviewer:
  model: opus
  prompt: """You are a senior engineer reviewing a fix.

  Evaluate the implementation by:
  1. Checking git diff against the original diagnosis
  2. Verifying the fix addresses the root cause
  3. Looking for regressions or side effects
  4. Checking test coverage
  5. Reviewing code quality and patterns

  Be thorough but not nitpicky. Focus on correctness and safety.

  Output:
  - review_approved: true/false
  - issues: List of blocking issues (if not approved)
  - suggestions: Non-blocking improvements
  - confidence: How confident are you the fix is correct
  """

# One-shot agent: post-deployment verification; its smoke_test_passed flag
# feeds deploy_and_verify's success field.
agent smoke_tester:
  model: opus
  prompt: """You are a QA engineer performing post-deployment verification.

  Follow the procedures in docs/MONITORING.md to verify:
  1. Health endpoints are responding
  2. The specific bug is fixed
  3. No new errors in logs
  4. Key metrics are stable

  Output:
  - smoke_test_passed: true/false
  - checks_performed: List of verifications done
  - issues_found: Any problems discovered
  - recommendations: Monitoring or follow-up suggestions
  """

# ============================================================================
# Blocks: Observation
# ============================================================================

# Connects the ws_observer to the execution WebSocket, submits the program,
# keeps observing until the run terminates, then emits a final UX assessment.
block observe_websocket(ws_url, token, program):
  session: ws_observer
    prompt: """Connect to the WebSocket at:
    {ws_url}&token={token}

    Once connected, send the execute message:
    {"type":"execute","program":<the program>}

    Program:
    ```
    {program}
    ```

    Log your initial connection experience."""

  # Keep resuming the same persistent observer until a terminal status arrives.
  loop until **execution completed (received status: completed/failed/aborted)**:
    resume: ws_observer
      prompt: """Continue observing the WebSocket stream.

      Log each message with timestamp, type, content, and your interpretation.
      After every 3-5 messages, synthesize: what would a user be thinking?"""

  output ws_feedback = resume: ws_observer
    prompt: """The execution has completed. Write your final assessment:
    1. Total duration and event count
    2. Status transitions observed
    3. What worked well from a UX perspective
    4. Pain points and confusion
    5. Top 3 recommendations"""

# Polls the environment's file tree (capped at 30 snapshots) while the
# WebSocket observer runs, then emits a final filesystem-UX assessment.
block observe_filesystem(env_id, api_url, token):
  session: file_observer
    prompt: """Fetch the initial file tree:
    GET {api_url}/environments/{env_id}/files/tree?depth=3
    Authorization: Bearer {token}

    Log the baseline directory structure."""
    permissions:
      network: ["{api_url}/*"]

  let snapshot_count = 0

  # max: 30 bounds polling in case the completion signal never arrives.
  loop until **websocket observer signals completion** (max: 30):
    let snapshot_count = snapshot_count + 1

    resume: file_observer
      prompt: """Snapshot #{snapshot_count}: Fetch and compare file tree.
      Log what's NEW, MODIFIED, and any interesting state files to read."""
      permissions:
        network: ["{api_url}/*"]

  output file_feedback = resume: file_observer
    prompt: """Final filesystem assessment:
    1. Total snapshots taken
    2. Files created during execution
    3. State file clarity
    4. Top 3 recommendations"""

# ============================================================================
# Blocks: Investigation
# ============================================================================

# Investigates a blocking error from three angles in parallel (code path,
# logs, execution context) using the persistent researcher agent, then resumes
# the researcher to merge all three analyses into one structured diagnosis.
block investigate_error(error_summary, ws_results, file_results, exec_info):
  # Parallel investigation from multiple angles
  parallel:
    code_analysis = session: researcher
      prompt: """Investigate the CODE PATH for this error:

      ERROR: {error_summary}

      Search the codebase for:
      1. The execution logic that produced this error
      2. Error handling paths
      3. Recent changes to related code (git log)

      Focus on understanding HOW this error was produced."""
      permissions:
        filesystem: ["read"]

    log_analysis = session: researcher
      prompt: """Investigate the LOGS for this error:

      ERROR: {error_summary}

      WebSocket observations:
      {ws_results}

      File explorer observations:
      {file_results}

      Look for:
      1. Error messages and stack traces
      2. Timing of events
      3. Any warnings before the error"""
      context: { ws_results, file_results }

    context_analysis = session: researcher
      prompt: """Investigate the EXECUTION CONTEXT:

      ERROR: {error_summary}

      Execution info:
      {exec_info}

      Check:
      1. Environment state
      2. Database records for this execution
      3. Any configuration issues"""
      context: exec_info
      permissions:
        database: ["read"]

  # Synthesize findings from all angles
  output diagnosis = resume: researcher
    prompt: """Synthesize your parallel investigations into a unified diagnosis:

    Code analysis: {code_analysis}
    Log analysis: {log_analysis}
    Context analysis: {context_analysis}

    Provide:
    - Root cause (specific and actionable)
    - Evidence chain
    - Confidence level
    - Affected components"""
    context: { code_analysis, log_analysis, context_analysis }

# Runs the diagnosis_verifier over a diagnosis; the caller loops on the
# returned verification's diagnosis_sound flag.
block verify_diagnosis(diagnosis, original_error, ws_results):
  output verification = session: diagnosis_verifier
    prompt: """Verify this diagnosis:

    DIAGNOSIS:
    {diagnosis}

    ORIGINAL ERROR:
    {original_error}

    OBSERVATIONS:
    {ws_results}

    Is this diagnosis sound? If not, what's missing?"""
    context: { diagnosis, ws_results }

# ============================================================================
# Blocks: Remediation
# ============================================================================

# Quick-fix path: engineer implements, reviewer checks, engineer iterates on
# feedback (at most 3 review rounds); returns the fix plus its final review.
block quick_fix_cycle(diagnosis, triage):
  # Implement the fix
  let fix = session: engineer
    prompt: """Implement a fix for:

    DIAGNOSIS: {diagnosis}
    APPROACH: {triage.recommended_approach}

    Make the smallest change that fixes the issue.
    Commit with: fix(scope): description"""
    permissions:
      filesystem: ["read", "write"]

  # Review loop
  loop until **review approved** (max: 3):
    let review = session: reviewer
      prompt: """Review this fix:

      DIAGNOSIS: {diagnosis}
      IMPLEMENTATION: {fix}

      Does it address the root cause? Any regressions?"""
      context: { diagnosis, fix }

    if **review has blocking issues**:
      # Resume the same engineer so review feedback lands on accumulated context.
      let fix = resume: engineer
        prompt: """Address review feedback:

        {review.issues}

        Update your fix accordingly."""
        context: review
        permissions:
          filesystem: ["read", "write"]

  output fix_result = { fix, review }

# Deploys (with up to 3 exponential-backoff retries) then smoke-tests;
# success is judged from the smoke test outcome.
block deploy_and_verify(fix_result):
  # Deploy with retry
  let deploy = session "Deploy fix"
    prompt: """Deploy following docs/DEPLOYMENT.md.
    Verify deployment succeeded."""
    retry: 3
    backoff: exponential
    permissions:
      network: ["*"]

  # Smoke test
  let smoke = session: smoke_tester
    prompt: """Post-deployment verification per docs/MONITORING.md:
    1. Health endpoints
    2. Verify bug is fixed
    3. Check for new errors"""

  output deploy_result = { deploy, smoke, success: **smoke test passed** }

# Bigger-change path: plan -> user approval checkpoint -> parallel phase
# execution -> review -> deploy. Early-returns a failed change_result when the
# plan is not approved or the review fails.
block bigger_change_flow(diagnosis, triage):
  # Build the plan
  let plan = session: build_planner
    prompt: """Create a build plan for:

    DIAGNOSIS: {diagnosis}
    TRIAGE: {triage}

    Follow docs/PLANNING_BEST_PRACTICES.md."""
    context:
      file: "docs/PLANNING_BEST_PRACTICES.md"

  # User approval of plan
  input plan_approval: **
    Build plan created:
    {plan}

    Approve and execute?
  **

  if plan_approval != "approve":
    output change_result = { success: false, reason: plan_approval, plan }
    return

  # Execute phases (parallel where possible)
  let phase_results = plan.phases
    | pmap:
        session: engineer
          prompt: """Execute phase:
          {item.name}
          {item.tasks}

          Complete tasks, run verification, commit."""
          permissions:
            filesystem: ["read", "write"]

  # Final review
  let review = session: reviewer
    prompt: """Review complete implementation:

    PLAN: {plan}
    RESULTS: {phase_results}

    All phases complete? Root cause addressed?"""
    context: { plan, phase_results }

  if **review not approved**:
    output change_result = { success: false, reason: "Review failed", review }
    return

  # Deploy
  let deploy_result = do deploy_and_verify({ fix: phase_results, review })

  output change_result = {
    success: deploy_result.success,
    plan,
    phases: phase_results,
    review,
    deploy: deploy_result
  }

# ============================================================================
# Main Workflow
# ============================================================================

# Phase 1: Setup
let exec = session "Execute POST /run"
  prompt: """POST to {api_url}/run with the test program.
  Return executionId, environmentId, wsUrl."""
  permissions:
    network: ["{api_url}/*"]

session "Log test configuration"
  prompt: """Log: timestamp, API URL, execution/environment IDs, program snippet."""
  context: exec

# Phase 2: Parallel Observation
parallel:
  ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
  file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)

# Phase 3: Synthesis
let synthesis = session: synthesizer
  prompt: """Synthesize observations into UX assessment.

  WebSocket: {ws_results}
  File Explorer: {file_results}

  Include error classification at the end."""
  context: { ws_results, file_results, exec }

# Phase 4: Error Remediation (if needed)
if **blocking error detected in synthesis**:

  # User checkpoint: investigate?
  input investigate_decision: **
    Blocking error detected:
    {synthesis.error_summary}

    Investigate and attempt remediation?
  **

  if investigate_decision == "skip":
    output final_result = { test_results: synthesis, remediation: "skipped" }

  elif investigate_decision == "investigate only":
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)
    output final_result = { test_results: synthesis, diagnosis, remediation: "investigation only" }

  else:
    # Full remediation flow
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)

    # Verification loop
    loop until **diagnosis verified** (max: 3):
      let verification = do verify_diagnosis(diagnosis, synthesis.error_summary, ws_results)

      if verification.diagnosis_sound:
        break
      else:
        # Feed the verifier's critique back to the persistent researcher.
        let diagnosis = resume: researcher
          prompt: """Diagnosis needs refinement:

          {verification.critique}

          Investigate: {verification.follow_up_questions}"""

    # User checkpoint: confirm diagnosis before action
    input diagnosis_confirmation: **
      Diagnosis verified:
      {diagnosis}

      Proceed to triage and remediation?
    **

    if diagnosis_confirmation != "proceed":
      output final_result = { test_results: synthesis, diagnosis, remediation: diagnosis_confirmation }

    else:
      # Triage
      let triage = session: triage_expert
        prompt: """Triage this bug: {diagnosis}"""
        context: diagnosis

      # Route based on triage
      choice **triage decision**:
        option "Quick fix":
          let fix_result = do quick_fix_cycle(diagnosis, triage)

          # User checkpoint before deploy
          input deploy_decision: **
            Fix implemented and reviewed:
            {fix_result}

            Deploy to production?
          **

          if deploy_decision == "deploy":
            let deploy_result = do deploy_and_verify(fix_result)

            if not deploy_result.success:
              # Recursive: re-run test to verify or catch new issues
              input retry_decision: **
                Deployment or smoke test failed.
                Re-run the full test to diagnose new issues?
              **

              if retry_decision == "yes":
                # Note: This would re-invoke the program - true self-healing
                session "Log: Triggering re-test after failed deployment"

            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: deploy_result }
          else:
            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: "skipped" }

        option "Bigger change":
          # CEO checkpoint is built into bigger_change_flow
          let change_result = do bigger_change_flow(diagnosis, triage)
          output final_result = { test_results: synthesis, diagnosis, triage, change: change_result }

else:
  # No blocking error
  output final_result = { test_results: synthesis, remediation: "none needed" }