feat: add OpenProse plugin skills
This commit is contained in:
@@ -0,0 +1,637 @@
# /run Endpoint UX Test with Error Remediation
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint, WITH automated error investigation and remediation.
#
# This extends the basic UX test with a comprehensive error handling pipeline:
# - If blocking errors are detected, investigate using logs, database, and code
# - Verify diagnosis through synthesis loop
# - Triage: quick fix vs. bigger change requiring CEO oversight
# - Quick fixes: engineer implements, deploys, tests, iterates
# - Bigger changes: build plan, parallel engineers, review, deploy, smoke test
#
# Key patterns demonstrated:
# - Mid-program `input` for user checkpoints
# - Persistent agents with `resume:` for accumulated context
# - Parallel investigation with multiple angles
# - `choice` blocks for triage decisions
# - `retry` with backoff for flaky operations
# - Recursive self-healing (if fix fails, re-test)

# Default test program (simple hello world)
const test_program = """
# Quick Hello
session "Say hello and count to 5"
"""

# Auto-auth: Read credentials from .env.test and fetch token
let api_url = session "Read API URL"
  prompt: """Read the TEST_API_URL from .env.test and return just the URL.
If not found, default to: https://api-v2.prose.md"""

let auth_token = session "Authenticate"
  prompt: """Read credentials from .env.test (TEST_EMAIL, TEST_PASSWORD).
Then POST to {api_url}/auth/login with these credentials.
Return just the token value (no Bearer prefix)."""
  context: api_url

# ============================================================================
# Agent Definitions
# ============================================================================

# --- Observation Team ---

# Persistent UX researcher that watches the WebSocket execution stream.
agent ws_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher observing an OpenProse program execution.

Your job is to watch the WebSocket execution stream and evaluate the experience
from a USER's perspective - not as an engineer checking correctness.

Focus on:
- Latency and responsiveness (does it FEEL fast?)
- Clarity of status transitions (does the user know what's happening?)
- Quality of streamed events (are they informative? overwhelming? sparse?)
- Error messages (helpful or cryptic?)
- Overall flow (smooth or jarring?)

Log your raw observations, then periodically synthesize into user feedback.
Think: "If I were a first-time user, what would I think right now?"
"""

# Persistent UX researcher that polls the filesystem for state changes.
agent file_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher monitoring the file system during execution.

Your job is to observe how the filesystem changes as a program runs, evaluating
whether the state management would make sense to a user browsing files.

Focus on:
- Directory structure clarity (can a user understand what's where?)
- File naming conventions (self-documenting or cryptic?)
- State file contents (readable? useful for debugging?)
- Timing of file creation/modification (predictable?)
- What a file browser UI should show

You will poll periodically and note changes between snapshots.
"""

# One-shot synthesizer: merges both observers' findings and emits the
# Error Classification footer the main workflow branches on.
agent synthesizer:
  model: opus
  prompt: """You are a senior UX researcher synthesizing observations from
multiple sources into prioritized, actionable feedback.

Your output should be:
1. Correlated findings (where did both observers notice the same thing?)
2. Prioritized action items (high/medium/low)
3. Specific quotes/evidence supporting each finding
4. Recommendations that are concrete and implementable

Be direct. "The loading state is confusing" not "Consider potentially improving..."

IMPORTANT: At the end of your synthesis, include:

## Error Classification
blocking_error: true/false
error_summary: "One-line description of the blocking error, if any"
"""

# --- Remediation Team ---

# Persistent investigator: used by investigate_error for parallel angles,
# then resumed for synthesis and diagnosis refinement.
agent researcher:
  model: opus
  persist: true
  prompt: """You are a senior engineer investigating a production error.

Your job is to diagnose the ROOT CAUSE of errors by:
1. Reading relevant log files
2. Querying the database for related records
3. Examining the source code that produced the error
4. Tracing the execution path

Be thorough but focused. Follow the evidence. Don't speculate without data.

Output a structured diagnosis:
- Error symptom: What the user/system observed
- Root cause: The underlying technical issue
- Evidence: Specific logs, code, or data supporting your diagnosis
- Confidence: High/Medium/Low
- Affected components: Which files/services are involved
"""

# Skeptical second opinion on the researcher's diagnosis; its
# diagnosis_sound flag gates the main workflow's verification loop.
agent diagnosis_verifier:
  model: opus
  prompt: """You are a staff engineer verifying a diagnosis.

Your job is to critically evaluate a proposed diagnosis by:
1. Checking if the evidence actually supports the conclusion
2. Looking for alternative explanations
3. Verifying the logic chain from symptom to root cause
4. Identifying gaps in the investigation

Be skeptical but fair. A good diagnosis should be:
- Supported by concrete evidence (not just plausible)
- Specific (not vague like "something went wrong")
- Actionable (points to what needs to be fixed)

Output:
- diagnosis_sound: true/false
- critique: What's wrong or missing (if not sound)
- follow_up_questions: What the researcher should investigate (if not sound)
- approved_diagnosis: The verified diagnosis (if sound)
"""

# Classifies a verified diagnosis as "quick_fix" or "bigger_change";
# the main workflow's choice block routes on this decision.
agent triage_expert:
  model: opus
  prompt: """You are a tech lead triaging a diagnosed bug.

Evaluate the diagnosis and categorize the fix:

QUICK FIX criteria (ALL must be true):
- Isolated bug affecting < 3 files
- No architectural changes required
- No API contract changes
- No security implications
- Estimated effort < 1 hour
- Low risk of regression

BIGGER CHANGE criteria (ANY triggers this):
- Affects > 3 files or multiple services
- Requires architectural decisions
- Changes API contracts or data models
- Has security implications
- Requires CEO/stakeholder input
- High risk of regression
- Unclear solution path

Output:
- triage_decision: "quick_fix" or "bigger_change"
- rationale: Why this classification
- risk_assessment: What could go wrong
- recommended_approach: High-level fix strategy
"""

# Persistent implementer: writes fixes in quick_fix_cycle and executes
# plan phases in bigger_change_flow; resumed to address review feedback.
agent engineer:
  model: opus
  persist: true
  prompt: """You are a senior engineer implementing a fix.

Your job is to:
1. Understand the diagnosis and recommended approach
2. Write clean, tested code that fixes the issue
3. Follow existing patterns in the codebase
4. Create atomic commits with clear messages
5. Verify the fix works

Do not over-engineer. Fix the issue directly and simply.
Follow the project's coding standards and testing patterns.
"""

# Produces the phased plan (plan.phases) that bigger_change_flow pmaps
# over with parallel engineers.
agent build_planner:
  model: opus
  prompt: """You are a software architect creating a build plan.

Follow the standards in docs/PLANNING_BEST_PRACTICES.md:
- Break work into self-contained phases
- Each phase should be testable and committable
- Identify parallel work where possible
- Define clear verification criteria
- Plan for rollback

Output a structured plan with:
- Phases (numbered, with dependencies)
- Tasks per phase
- Verification steps
- Commit strategy
- Risk mitigation
"""

# Reviews implementations; review_approved gates the quick-fix loop and
# the bigger-change deploy step.
agent reviewer:
  model: opus
  prompt: """You are a senior engineer reviewing a fix.

Evaluate the implementation by:
1. Checking git diff against the original diagnosis
2. Verifying the fix addresses the root cause
3. Looking for regressions or side effects
4. Checking test coverage
5. Reviewing code quality and patterns

Be thorough but not nitpicky. Focus on correctness and safety.

Output:
- review_approved: true/false
- issues: List of blocking issues (if not approved)
- suggestions: Non-blocking improvements
- confidence: How confident are you the fix is correct
"""

# Post-deployment verification; smoke_test_passed feeds
# deploy_and_verify's success flag.
agent smoke_tester:
  model: opus
  prompt: """You are a QA engineer performing post-deployment verification.

Follow the procedures in docs/MONITORING.md to verify:
1. Health endpoints are responding
2. The specific bug is fixed
3. No new errors in logs
4. Key metrics are stable

Output:
- smoke_test_passed: true/false
- checks_performed: List of verifications done
- issues_found: Any problems discovered
- recommendations: Monitoring or follow-up suggestions
"""

# ============================================================================
# Blocks: Observation
# ============================================================================

# Connects the ws_observer to the execution WebSocket, submits the program,
# narrates the stream until a terminal status, then emits ws_feedback.
block observe_websocket(ws_url, token, program):
  session: ws_observer
    prompt: """Connect to the WebSocket at:
{ws_url}&token={token}

Once connected, send the execute message:
{"type":"execute","program":<the program>}

Program:
```
{program}
```

Log your initial connection experience."""

  loop until **execution completed (received status: completed/failed/aborted)**:
    resume: ws_observer
      prompt: """Continue observing the WebSocket stream.

Log each message with timestamp, type, content, and your interpretation.
After every 3-5 messages, synthesize: what would a user be thinking?"""

  output ws_feedback = resume: ws_observer
    prompt: """The execution has completed. Write your final assessment:
1. Total duration and event count
2. Status transitions observed
3. What worked well from a UX perspective
4. Pain points and confusion
5. Top 3 recommendations"""

# Polls the environment's file tree via the HTTP API, diffing snapshots
# until the run ends, then emits file_feedback.
block observe_filesystem(env_id, api_url, token):
  session: file_observer
    prompt: """Fetch the initial file tree:
GET {api_url}/environments/{env_id}/files/tree?depth=3
Authorization: Bearer {token}

Log the baseline directory structure."""
    permissions:
      network: ["{api_url}/*"]

  let snapshot_count = 0

  # NOTE(review): the loop condition references the websocket observer's
  # completion, a signal produced by a sibling parallel task — confirm the
  # runtime can observe it; max: 30 bounds the loop regardless.
  loop until **websocket observer signals completion** (max: 30):
    let snapshot_count = snapshot_count + 1

    resume: file_observer
      prompt: """Snapshot #{snapshot_count}: Fetch and compare file tree.
Log what's NEW, MODIFIED, and any interesting state files to read."""
      permissions:
        network: ["{api_url}/*"]

  output file_feedback = resume: file_observer
    prompt: """Final filesystem assessment:
1. Total snapshots taken
2. Files created during execution
3. State file clarity
4. Top 3 recommendations"""

# ============================================================================
# Blocks: Investigation
# ============================================================================

# Fans the persistent researcher out over three parallel angles (code,
# logs, execution context), then resumes it to merge findings into a
# single structured diagnosis.
block investigate_error(error_summary, ws_results, file_results, exec_info):
  # Parallel investigation from multiple angles
  parallel:
    code_analysis = session: researcher
      prompt: """Investigate the CODE PATH for this error:

ERROR: {error_summary}

Search the codebase for:
1. The execution logic that produced this error
2. Error handling paths
3. Recent changes to related code (git log)

Focus on understanding HOW this error was produced."""
      permissions:
        filesystem: ["read"]

    log_analysis = session: researcher
      prompt: """Investigate the LOGS for this error:

ERROR: {error_summary}

WebSocket observations:
{ws_results}

File explorer observations:
{file_results}

Look for:
1. Error messages and stack traces
2. Timing of events
3. Any warnings before the error"""
      context: { ws_results, file_results }

    context_analysis = session: researcher
      prompt: """Investigate the EXECUTION CONTEXT:

ERROR: {error_summary}

Execution info:
{exec_info}

Check:
1. Environment state
2. Database records for this execution
3. Any configuration issues"""
      context: exec_info
      permissions:
        database: ["read"]

  # Synthesize findings from all angles
  output diagnosis = resume: researcher
    prompt: """Synthesize your parallel investigations into a unified diagnosis:

Code analysis: {code_analysis}
Log analysis: {log_analysis}
Context analysis: {context_analysis}

Provide:
- Root cause (specific and actionable)
- Evidence chain
- Confidence level
- Affected components"""
    context: { code_analysis, log_analysis, context_analysis }

# Hands the diagnosis to the verifier agent; emits its structured
# verification (diagnosis_sound, critique, follow_up_questions, ...).
block verify_diagnosis(diagnosis, original_error, ws_results):
  # NOTE(review): original_error is interpolated into the prompt but not
  # listed in context: — confirm interpolation alone is sufficient here.
  output verification = session: diagnosis_verifier
    prompt: """Verify this diagnosis:

DIAGNOSIS:
{diagnosis}

ORIGINAL ERROR:
{original_error}

OBSERVATIONS:
{ws_results}

Is this diagnosis sound? If not, what's missing?"""
    context: { diagnosis, ws_results }

# ============================================================================
# Blocks: Remediation
# ============================================================================

# Engineer implements the fix, then a bounded review loop (max 3) resumes
# the engineer on blocking issues; emits the fix plus its final review.
block quick_fix_cycle(diagnosis, triage):
  # Implement the fix
  let fix = session: engineer
    prompt: """Implement a fix for:

DIAGNOSIS: {diagnosis}
APPROACH: {triage.recommended_approach}

Make the smallest change that fixes the issue.
Commit with: fix(scope): description"""
    permissions:
      filesystem: ["read", "write"]

  # Review loop
  loop until **review approved** (max: 3):
    let review = session: reviewer
      prompt: """Review this fix:

DIAGNOSIS: {diagnosis}
IMPLEMENTATION: {fix}

Does it address the root cause? Any regressions?"""
      context: { diagnosis, fix }

    if **review has blocking issues**:
      let fix = resume: engineer
        prompt: """Address review feedback:

{review.issues}

Update your fix accordingly."""
        context: review
        permissions:
          filesystem: ["read", "write"]

  output fix_result = { fix, review }

# Deploys (with exponential-backoff retry), smoke tests, and emits a
# result whose success flag reflects the smoke test.
# NOTE(review): the fix_result parameter is accepted but never referenced
# in this block's prompts — confirm this is intentional.
block deploy_and_verify(fix_result):
  # Deploy with retry
  let deploy = session "Deploy fix"
    prompt: """Deploy following docs/DEPLOYMENT.md.
Verify deployment succeeded."""
    retry: 3
    backoff: exponential
    permissions:
      network: ["*"]

  # Smoke test
  let smoke = session: smoke_tester
    prompt: """Post-deployment verification per docs/MONITORING.md:
1. Health endpoints
2. Verify bug is fixed
3. Check for new errors"""

  output deploy_result = { deploy, smoke, success: **smoke test passed** }

# Plan -> human approval -> parallel phase execution -> review -> deploy.
# Returns early (success: false) if the plan or review is rejected.
block bigger_change_flow(diagnosis, triage):
  # Build the plan
  let plan = session: build_planner
    prompt: """Create a build plan for:

DIAGNOSIS: {diagnosis}
TRIAGE: {triage}

Follow docs/PLANNING_BEST_PRACTICES.md."""
    context:
      file: "docs/PLANNING_BEST_PRACTICES.md"

  # User approval of plan
  input plan_approval: **
Build plan created:
{plan}

Approve and execute?
**

  if plan_approval != "approve":
    output change_result = { success: false, reason: plan_approval, plan }
    return

  # Execute phases (parallel where possible)
  let phase_results = plan.phases
    | pmap:
        session: engineer
          prompt: """Execute phase:
{item.name}
{item.tasks}

Complete tasks, run verification, commit."""
          permissions:
            filesystem: ["read", "write"]

  # Final review
  let review = session: reviewer
    prompt: """Review complete implementation:

PLAN: {plan}
RESULTS: {phase_results}

All phases complete? Root cause addressed?"""
    context: { plan, phase_results }

  if **review not approved**:
    output change_result = { success: false, reason: "Review failed", review }
    return

  # Deploy
  let deploy_result = do deploy_and_verify({ fix: phase_results, review })

  output change_result = {
    success: deploy_result.success,
    plan,
    phases: phase_results,
    review,
    deploy: deploy_result
  }

# ============================================================================
# Main Workflow
# ============================================================================

# Phase 1: Setup
let exec = session "Execute POST /run"
  prompt: """POST to {api_url}/run with the test program.
Return executionId, environmentId, wsUrl."""
  permissions:
    network: ["{api_url}/*"]

session "Log test configuration"
  prompt: """Log: timestamp, API URL, execution/environment IDs, program snippet."""
  context: exec

# Phase 2: Parallel Observation
parallel:
  ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
  file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)

# Phase 3: Synthesis
let synthesis = session: synthesizer
  prompt: """Synthesize observations into UX assessment.

WebSocket: {ws_results}
File Explorer: {file_results}

Include error classification at the end."""
  context: { ws_results, file_results, exec }

# Phase 4: Error Remediation (if needed)
if **blocking error detected in synthesis**:

  # User checkpoint: investigate?
  input investigate_decision: **
Blocking error detected:
{synthesis.error_summary}

Investigate and attempt remediation?
**

  if investigate_decision == "skip":
    output final_result = { test_results: synthesis, remediation: "skipped" }

  elif investigate_decision == "investigate only":
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)
    output final_result = { test_results: synthesis, diagnosis, remediation: "investigation only" }

  else:
    # Full remediation flow
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)

    # Verification loop
    loop until **diagnosis verified** (max: 3):
      let verification = do verify_diagnosis(diagnosis, synthesis.error_summary, ws_results)

      if verification.diagnosis_sound:
        break
      else:
        let diagnosis = resume: researcher
          prompt: """Diagnosis needs refinement:

{verification.critique}

Investigate: {verification.follow_up_questions}"""

    # User checkpoint: confirm diagnosis before action
    input diagnosis_confirmation: **
Diagnosis verified:
{diagnosis}

Proceed to triage and remediation?
**

    if diagnosis_confirmation != "proceed":
      output final_result = { test_results: synthesis, diagnosis, remediation: diagnosis_confirmation }

    else:
      # Triage
      let triage = session: triage_expert
        prompt: """Triage this bug: {diagnosis}"""
        context: diagnosis

      # Route based on triage
      choice **triage decision**:
        option "Quick fix":
          let fix_result = do quick_fix_cycle(diagnosis, triage)

          # User checkpoint before deploy
          input deploy_decision: **
Fix implemented and reviewed:
{fix_result}

Deploy to production?
**

          if deploy_decision == "deploy":
            let deploy_result = do deploy_and_verify(fix_result)

            if not deploy_result.success:
              # Recursive: re-run test to verify or catch new issues
              input retry_decision: **
Deployment or smoke test failed.
Re-run the full test to diagnose new issues?
**

              if retry_decision == "yes":
                # Note: This would re-invoke the program - true self-healing
                session "Log: Triggering re-test after failed deployment"

            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: deploy_result }
          else:
            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: "skipped" }

        option "Bigger change":
          # CEO checkpoint is built into bigger_change_flow
          let change_result = do bigger_change_flow(diagnosis, triage)
          output final_result = { test_results: synthesis, diagnosis, triage, change: change_result }

else:
  # No blocking error
  output final_result = { test_results: synthesis, remediation: "none needed" }
|
||||
Reference in New Issue
Block a user