# /run Endpoint UX Test with Error Remediation
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint, WITH automated error investigation and remediation.
#
# This extends the basic UX test with a comprehensive error handling pipeline:
# - If blocking errors are detected, investigate using logs, database, and code
# - Verify diagnosis through synthesis loop
# - Triage: quick fix vs. bigger change requiring CEO oversight
# - Quick fixes: engineer implements, deploys, tests, iterates
# - Bigger changes: build plan, parallel engineers, review, deploy, smoke test
#
# Key patterns demonstrated:
# - Mid-program `input` for user checkpoints
# - Persistent agents with `resume:` for accumulated context
# - Parallel investigation with multiple angles
# - `choice` blocks for triage decisions
# - `retry` with backoff for flaky operations
# - Recursive self-healing (if fix fails, re-test)

# Default test program (simple hello world)
# Used as the minimal workload submitted to POST /run; kept trivial so the
# observers can focus on endpoint UX rather than program complexity.
const test_program = """
# Quick Hello
session "Say hello and count to 5"
"""

# Auto-auth: Read credentials from .env.test and fetch token
# Two bootstrap sessions: the first resolves the API base URL, the second
# exchanges the test credentials for a bearer token used by every later call.
let api_url = session "Read API URL"
  prompt: """Read the TEST_API_URL from .env.test and return just the URL.
  If not found, default to: https://api-v2.prose.md"""

let auth_token = session "Authenticate"
  prompt: """Read credentials from .env.test (TEST_EMAIL, TEST_PASSWORD).
  Then POST to {api_url}/auth/login with these credentials.
  Return just the token value (no Bearer prefix)."""
  context: api_url

# ============================================================================
# Agent Definitions
# ============================================================================

# --- Observation Team ---

# Persistent agent: accumulates context across `resume:` calls while watching
# the live WebSocket execution stream from a first-time user's perspective.
agent ws_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher observing an OpenProse program execution.

  Your job is to watch the WebSocket execution stream and evaluate the experience
  from a USER's perspective - not as an engineer checking correctness.

  Focus on:
  - Latency and responsiveness (does it FEEL fast?)
  - Clarity of status transitions (does the user know what's happening?)
  - Quality of streamed events (are they informative? overwhelming? sparse?)
  - Error messages (helpful or cryptic?)
  - Overall flow (smooth or jarring?)

  Log your raw observations, then periodically synthesize into user feedback.
  Think: "If I were a first-time user, what would I think right now?"
  """

# Persistent agent: polls the environment's file tree between snapshots and
# evaluates whether on-disk state would make sense to a user browsing files.
agent file_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher monitoring the file system during execution.

  Your job is to observe how the filesystem changes as a program runs, evaluating
  whether the state management would make sense to a user browsing files.

  Focus on:
  - Directory structure clarity (can a user understand what's where?)
  - File naming conventions (self-documenting or cryptic?)
  - State file contents (readable? useful for debugging?)
  - Timing of file creation/modification (predictable?)
  - What a file browser UI should show

  You will poll periodically and note changes between snapshots.
  """

# One-shot agent: merges both observers' findings into prioritized feedback.
# Its "Error Classification" trailer (blocking_error / error_summary) is what
# the main workflow's remediation branch keys off.
agent synthesizer:
  model: opus
  prompt: """You are a senior UX researcher synthesizing observations from
  multiple sources into prioritized, actionable feedback.

  Your output should be:
  1. Correlated findings (where did both observers notice the same thing?)
  2. Prioritized action items (high/medium/low)
  3. Specific quotes/evidence supporting each finding
  4. Recommendations that are concrete and implementable

  Be direct. "The loading state is confusing" not "Consider potentially improving..."

  IMPORTANT: At the end of your synthesis, include:

  ## Error Classification
  blocking_error: true/false
  error_summary: "One-line description of the blocking error, if any"
  """

# --- Remediation Team ---

# Persistent agent: runs the parallel investigations in investigate_error and
# is later resumed to unify its own findings and to refine the diagnosis.
agent researcher:
  model: opus
  persist: true
  prompt: """You are a senior engineer investigating a production error.

  Your job is to diagnose the ROOT CAUSE of errors by:
  1. Reading relevant log files
  2. Querying the database for related records
  3. Examining the source code that produced the error
  4. Tracing the execution path

  Be thorough but focused. Follow the evidence. Don't speculate without data.

  Output a structured diagnosis:
  - Error symptom: What the user/system observed
  - Root cause: The underlying technical issue
  - Evidence: Specific logs, code, or data supporting your diagnosis
  - Confidence: High/Medium/Low
  - Affected components: Which files/services are involved
  """

# One-shot agent: adversarially reviews a diagnosis. Its diagnosis_sound flag
# drives the main workflow's verification loop (refine vs. proceed).
agent diagnosis_verifier:
  model: opus
  prompt: """You are a staff engineer verifying a diagnosis.

  Your job is to critically evaluate a proposed diagnosis by:
  1. Checking if the evidence actually supports the conclusion
  2. Looking for alternative explanations
  3. Verifying the logic chain from symptom to root cause
  4. Identifying gaps in the investigation

  Be skeptical but fair. A good diagnosis should be:
  - Supported by concrete evidence (not just plausible)
  - Specific (not vague like "something went wrong")
  - Actionable (points to what needs to be fixed)

  Output:
  - diagnosis_sound: true/false
  - critique: What's wrong or missing (if not sound)
  - follow_up_questions: What the researcher should investigate (if not sound)
  - approved_diagnosis: The verified diagnosis (if sound)
  """

# One-shot agent: classifies a verified diagnosis as "quick_fix" or
# "bigger_change"; the main workflow's `choice` block routes on this decision.
agent triage_expert:
  model: opus
  prompt: """You are a tech lead triaging a diagnosed bug.

  Evaluate the diagnosis and categorize the fix:

  QUICK FIX criteria (ALL must be true):
  - Isolated bug affecting < 3 files
  - No architectural changes required
  - No API contract changes
  - No security implications
  - Estimated effort < 1 hour
  - Low risk of regression

  BIGGER CHANGE criteria (ANY triggers this):
  - Affects > 3 files or multiple services
  - Requires architectural decisions
  - Changes API contracts or data models
  - Has security implications
  - Requires CEO/stakeholder input
  - High risk of regression
  - Unclear solution path

  Output:
  - triage_decision: "quick_fix" or "bigger_change"
  - rationale: Why this classification
  - risk_assessment: What could go wrong
  - recommended_approach: High-level fix strategy
  """

# Persistent agent: implements fixes in quick_fix_cycle (and executes phases
# in bigger_change_flow); resumed to address review feedback with prior context.
agent engineer:
  model: opus
  persist: true
  prompt: """You are a senior engineer implementing a fix.

  Your job is to:
  1. Understand the diagnosis and recommended approach
  2. Write clean, tested code that fixes the issue
  3. Follow existing patterns in the codebase
  4. Create atomic commits with clear messages
  5. Verify the fix works

  Do not over-engineer. Fix the issue directly and simply.
  Follow the project's coding standards and testing patterns.
  """

# One-shot agent: produces the phased build plan used by bigger_change_flow;
# the plan's phases are fanned out to parallel engineers via pmap.
agent build_planner:
  model: opus
  prompt: """You are a software architect creating a build plan.

  Follow the standards in docs/PLANNING_BEST_PRACTICES.md:
  - Break work into self-contained phases
  - Each phase should be testable and committable
  - Identify parallel work where possible
  - Define clear verification criteria
  - Plan for rollback

  Output a structured plan with:
  - Phases (numbered, with dependencies)
  - Tasks per phase
  - Verification steps
  - Commit strategy
  - Risk mitigation
  """

# One-shot agent: reviews implementations; its review_approved flag gates the
# quick_fix_cycle loop and the bigger_change_flow deploy step.
agent reviewer:
  model: opus
  prompt: """You are a senior engineer reviewing a fix.

  Evaluate the implementation by:
  1. Checking git diff against the original diagnosis
  2. Verifying the fix addresses the root cause
  3. Looking for regressions or side effects
  4. Checking test coverage
  5. Reviewing code quality and patterns

  Be thorough but not nitpicky. Focus on correctness and safety.

  Output:
  - review_approved: true/false
  - issues: List of blocking issues (if not approved)
  - suggestions: Non-blocking improvements
  - confidence: How confident are you the fix is correct
  """

# One-shot agent: post-deployment verification; its smoke_test_passed flag
# feeds deploy_and_verify's success field.
agent smoke_tester:
  model: opus
  prompt: """You are a QA engineer performing post-deployment verification.

  Follow the procedures in docs/MONITORING.md to verify:
  1. Health endpoints are responding
  2. The specific bug is fixed
  3. No new errors in logs
  4. Key metrics are stable

  Output:
  - smoke_test_passed: true/false
  - checks_performed: List of verifications done
  - issues_found: Any problems discovered
  - recommendations: Monitoring or follow-up suggestions
  """

# ============================================================================
# Blocks: Observation
# ============================================================================

# Connects the ws_observer to the execution WebSocket, submits the program,
# keeps observing until the run terminates, then emits a final UX assessment.
block observe_websocket(ws_url, token, program):
  session: ws_observer
    prompt: """Connect to the WebSocket at:
    {ws_url}&token={token}

    Once connected, send the execute message:
    {"type":"execute","program":<the program>}

    Program:
    ```
    {program}
    ```

    Log your initial connection experience."""

  # Keep resuming the same persistent observer until a terminal status arrives.
  loop until **execution completed (received status: completed/failed/aborted)**:
    resume: ws_observer
      prompt: """Continue observing the WebSocket stream.

      Log each message with timestamp, type, content, and your interpretation.
      After every 3-5 messages, synthesize: what would a user be thinking?"""

  output ws_feedback = resume: ws_observer
    prompt: """The execution has completed. Write your final assessment:
    1. Total duration and event count
    2. Status transitions observed
    3. What worked well from a UX perspective
    4. Pain points and confusion
    5. Top 3 recommendations"""

# Polls the environment's file tree (capped at 30 snapshots) while the
# WebSocket observer runs, then emits a final filesystem-UX assessment.
block observe_filesystem(env_id, api_url, token):
  session: file_observer
    prompt: """Fetch the initial file tree:
    GET {api_url}/environments/{env_id}/files/tree?depth=3
    Authorization: Bearer {token}

    Log the baseline directory structure."""
    permissions:
      network: ["{api_url}/*"]

  let snapshot_count = 0

  # max: 30 bounds polling in case the completion signal never arrives.
  loop until **websocket observer signals completion** (max: 30):
    let snapshot_count = snapshot_count + 1

    resume: file_observer
      prompt: """Snapshot #{snapshot_count}: Fetch and compare file tree.
      Log what's NEW, MODIFIED, and any interesting state files to read."""
      permissions:
        network: ["{api_url}/*"]

  output file_feedback = resume: file_observer
    prompt: """Final filesystem assessment:
    1. Total snapshots taken
    2. Files created during execution
    3. State file clarity
    4. Top 3 recommendations"""

# ============================================================================
# Blocks: Investigation
# ============================================================================

# Investigates a blocking error from three angles in parallel (code path,
# logs, execution context) using the persistent researcher agent, then resumes
# the researcher to merge all three analyses into one structured diagnosis.
block investigate_error(error_summary, ws_results, file_results, exec_info):
  # Parallel investigation from multiple angles
  parallel:
    code_analysis = session: researcher
      prompt: """Investigate the CODE PATH for this error:

      ERROR: {error_summary}

      Search the codebase for:
      1. The execution logic that produced this error
      2. Error handling paths
      3. Recent changes to related code (git log)

      Focus on understanding HOW this error was produced."""
      permissions:
        filesystem: ["read"]

    log_analysis = session: researcher
      prompt: """Investigate the LOGS for this error:

      ERROR: {error_summary}

      WebSocket observations:
      {ws_results}

      File explorer observations:
      {file_results}

      Look for:
      1. Error messages and stack traces
      2. Timing of events
      3. Any warnings before the error"""
      context: { ws_results, file_results }

    context_analysis = session: researcher
      prompt: """Investigate the EXECUTION CONTEXT:

      ERROR: {error_summary}

      Execution info:
      {exec_info}

      Check:
      1. Environment state
      2. Database records for this execution
      3. Any configuration issues"""
      context: exec_info
      permissions:
        database: ["read"]

  # Synthesize findings from all angles
  output diagnosis = resume: researcher
    prompt: """Synthesize your parallel investigations into a unified diagnosis:

    Code analysis: {code_analysis}
    Log analysis: {log_analysis}
    Context analysis: {context_analysis}

    Provide:
    - Root cause (specific and actionable)
    - Evidence chain
    - Confidence level
    - Affected components"""
    context: { code_analysis, log_analysis, context_analysis }

# Runs the diagnosis_verifier over a diagnosis; the caller loops on the
# returned verification's diagnosis_sound flag.
block verify_diagnosis(diagnosis, original_error, ws_results):
  output verification = session: diagnosis_verifier
    prompt: """Verify this diagnosis:

    DIAGNOSIS:
    {diagnosis}

    ORIGINAL ERROR:
    {original_error}

    OBSERVATIONS:
    {ws_results}

    Is this diagnosis sound? If not, what's missing?"""
    context: { diagnosis, ws_results }

# ============================================================================
# Blocks: Remediation
# ============================================================================

# Quick-fix path: engineer implements, reviewer checks, engineer iterates on
# feedback (at most 3 review rounds); returns the fix plus its final review.
block quick_fix_cycle(diagnosis, triage):
  # Implement the fix
  let fix = session: engineer
    prompt: """Implement a fix for:

    DIAGNOSIS: {diagnosis}
    APPROACH: {triage.recommended_approach}

    Make the smallest change that fixes the issue.
    Commit with: fix(scope): description"""
    permissions:
      filesystem: ["read", "write"]

  # Review loop
  loop until **review approved** (max: 3):
    let review = session: reviewer
      prompt: """Review this fix:

      DIAGNOSIS: {diagnosis}
      IMPLEMENTATION: {fix}

      Does it address the root cause? Any regressions?"""
      context: { diagnosis, fix }

    if **review has blocking issues**:
      # Resume the same engineer so review feedback lands on accumulated context.
      let fix = resume: engineer
        prompt: """Address review feedback:

        {review.issues}

        Update your fix accordingly."""
        context: review
        permissions:
          filesystem: ["read", "write"]

  output fix_result = { fix, review }

# Deploys (with up to 3 exponential-backoff retries) then smoke-tests;
# success is judged from the smoke test outcome.
block deploy_and_verify(fix_result):
  # Deploy with retry
  let deploy = session "Deploy fix"
    prompt: """Deploy following docs/DEPLOYMENT.md.
    Verify deployment succeeded."""
    retry: 3
    backoff: exponential
    permissions:
      network: ["*"]

  # Smoke test
  let smoke = session: smoke_tester
    prompt: """Post-deployment verification per docs/MONITORING.md:
    1. Health endpoints
    2. Verify bug is fixed
    3. Check for new errors"""

  output deploy_result = { deploy, smoke, success: **smoke test passed** }

# Bigger-change path: plan -> user approval checkpoint -> parallel phase
# execution -> review -> deploy. Early-returns a failed change_result when the
# plan is not approved or the review fails.
block bigger_change_flow(diagnosis, triage):
  # Build the plan
  let plan = session: build_planner
    prompt: """Create a build plan for:

    DIAGNOSIS: {diagnosis}
    TRIAGE: {triage}

    Follow docs/PLANNING_BEST_PRACTICES.md."""
    context:
      file: "docs/PLANNING_BEST_PRACTICES.md"

  # User approval of plan
  input plan_approval: **
    Build plan created:
    {plan}

    Approve and execute?
  **

  if plan_approval != "approve":
    output change_result = { success: false, reason: plan_approval, plan }
    return

  # Execute phases (parallel where possible)
  let phase_results = plan.phases
    | pmap:
        session: engineer
          prompt: """Execute phase:
          {item.name}
          {item.tasks}

          Complete tasks, run verification, commit."""
          permissions:
            filesystem: ["read", "write"]

  # Final review
  let review = session: reviewer
    prompt: """Review complete implementation:

    PLAN: {plan}
    RESULTS: {phase_results}

    All phases complete? Root cause addressed?"""
    context: { plan, phase_results }

  if **review not approved**:
    output change_result = { success: false, reason: "Review failed", review }
    return

  # Deploy
  let deploy_result = do deploy_and_verify({ fix: phase_results, review })

  output change_result = {
    success: deploy_result.success,
    plan,
    phases: phase_results,
    review,
    deploy: deploy_result
  }

# ============================================================================
# Main Workflow
# ============================================================================

# Phase 1: Setup
let exec = session "Execute POST /run"
  prompt: """POST to {api_url}/run with the test program.
  Return executionId, environmentId, wsUrl."""
  permissions:
    network: ["{api_url}/*"]

session "Log test configuration"
  prompt: """Log: timestamp, API URL, execution/environment IDs, program snippet."""
  context: exec

# Phase 2: Parallel Observation
parallel:
  ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
  file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)

# Phase 3: Synthesis
let synthesis = session: synthesizer
  prompt: """Synthesize observations into UX assessment.

  WebSocket: {ws_results}
  File Explorer: {file_results}

  Include error classification at the end."""
  context: { ws_results, file_results, exec }

# Phase 4: Error Remediation (if needed)
if **blocking error detected in synthesis**:

  # User checkpoint: investigate?
  input investigate_decision: **
    Blocking error detected:
    {synthesis.error_summary}

    Investigate and attempt remediation?
  **

  if investigate_decision == "skip":
    output final_result = { test_results: synthesis, remediation: "skipped" }

  elif investigate_decision == "investigate only":
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)
    output final_result = { test_results: synthesis, diagnosis, remediation: "investigation only" }

  else:
    # Full remediation flow
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)

    # Verification loop
    loop until **diagnosis verified** (max: 3):
      let verification = do verify_diagnosis(diagnosis, synthesis.error_summary, ws_results)

      if verification.diagnosis_sound:
        break
      else:
        # Feed the verifier's critique back to the persistent researcher.
        let diagnosis = resume: researcher
          prompt: """Diagnosis needs refinement:

          {verification.critique}

          Investigate: {verification.follow_up_questions}"""

    # User checkpoint: confirm diagnosis before action
    input diagnosis_confirmation: **
      Diagnosis verified:
      {diagnosis}

      Proceed to triage and remediation?
    **

    if diagnosis_confirmation != "proceed":
      output final_result = { test_results: synthesis, diagnosis, remediation: diagnosis_confirmation }

    else:
      # Triage
      let triage = session: triage_expert
        prompt: """Triage this bug: {diagnosis}"""
        context: diagnosis

      # Route based on triage
      choice **triage decision**:
        option "Quick fix":
          let fix_result = do quick_fix_cycle(diagnosis, triage)

          # User checkpoint before deploy
          input deploy_decision: **
            Fix implemented and reviewed:
            {fix_result}

            Deploy to production?
          **

          if deploy_decision == "deploy":
            let deploy_result = do deploy_and_verify(fix_result)

            if not deploy_result.success:
              # Recursive: re-run test to verify or catch new issues
              input retry_decision: **
                Deployment or smoke test failed.
                Re-run the full test to diagnose new issues?
              **

              if retry_decision == "yes":
                # Note: This would re-invoke the program - true self-healing
                session "Log: Triggering re-test after failed deployment"

            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: deploy_result }
          else:
            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: "skipped" }

        option "Bigger change":
          # CEO checkpoint is built into bigger_change_flow
          let change_result = do bigger_change_flow(diagnosis, triage)
          output final_result = { test_results: synthesis, diagnosis, triage, change: change_result }

else:
  # No blocking error
  output final_result = { test_results: synthesis, remediation: "none needed" }