commit 56486a03ae2320e4d0826f3ef3d67a377a061a41 Author: Zhongwei Li Date: Sun Nov 30 08:39:00 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..9c8cd69 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "orchestration", + "description": "Shared multi-agent coordination and workflow orchestration patterns for complex Claude Code workflows. Skills-only plugin providing proven patterns for parallel execution (3-5x speedup), multi-model validation (Grok/Gemini/GPT-5), quality gates, TDD loops, TodoWrite phase tracking, and comprehensive error recovery. Battle-tested patterns from 100+ days production use.", + "version": "0.1.1", + "author": { + "name": "Jack Rudenko", + "email": "i@madappgang.com", + "company": "MadAppGang" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f02abe8 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# orchestration + +Shared multi-agent coordination and workflow orchestration patterns for complex Claude Code workflows. Skills-only plugin providing proven patterns for parallel execution (3-5x speedup), multi-model validation (Grok/Gemini/GPT-5), quality gates, TDD loops, TodoWrite phase tracking, and comprehensive error recovery. Battle-tested patterns from 100+ days production use. diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..96ac5e3 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,61 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:MadAppGang/claude-code:plugins/orchestration", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "ad90df36843224b97a17f14cfd5a207d4e053c67", + "treeHash": "811ec6920184f4235cc78d0b9ca0025fae96488caf35059ca1224e8d5cb24150", + "generatedAt": "2025-11-28T10:12:05.859643Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "orchestration", + "description": "Shared multi-agent coordination and workflow orchestration patterns for complex Claude Code workflows. Skills-only plugin providing proven patterns for parallel execution (3-5x speedup), multi-model validation (Grok/Gemini/GPT-5), quality gates, TDD loops, TodoWrite phase tracking, and comprehensive error recovery. 
Battle-tested patterns from 100+ days production use.", + "version": "0.1.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "215babb6dff86f8783d8e97d0a21546e2aaa3b055bc1cde5c4e16c6bf3d6c7a5" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "36414e18947889714f9d80576e01edaab8b3ffdf9efd44107e0f5fb42b0e2270" + }, + { + "path": "skills/todowrite-orchestration/SKILL.md", + "sha256": "f681467a2eef99945f90b8f2b654c8c9713f4153afdff19a0c0b312d2f6084de" + }, + { + "path": "skills/quality-gates/SKILL.md", + "sha256": "ba13c21d8e9f8abeb856bbec4a6ebc821e92dfe0857942797959087452b175c3" + }, + { + "path": "skills/error-recovery/SKILL.md", + "sha256": "133564d1bc0d35a8c35074b089120fe7d7a757b71bdd6222a7a5c23e45f20aa3" + }, + { + "path": "skills/multi-agent-coordination/SKILL.md", + "sha256": "9e0156350eb09447221898598611a5270921c31168e7698c4bd0d3bd0ced4616" + }, + { + "path": "skills/multi-model-validation/SKILL.md", + "sha256": "9d5c46dfa531f911f4fcc4070fd6c039900bcdb440c997f7eac384001a1ba33e" + } + ], + "dirSha256": "811ec6920184f4235cc78d0b9ca0025fae96488caf35059ca1224e8d5cb24150" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/error-recovery/SKILL.md b/skills/error-recovery/SKILL.md new file mode 100644 index 0000000..3daaad8 --- /dev/null +++ b/skills/error-recovery/SKILL.md @@ -0,0 +1,1107 @@ +--- +name: error-recovery +description: Handle errors, timeouts, and failures in multi-agent workflows. Use when dealing with external model timeouts, API failures, partial success, user cancellation, or graceful degradation. Trigger keywords - "error", "failure", "timeout", "retry", "fallback", "cancelled", "graceful degradation", "recovery", "partial success". +version: 0.1.0 +tags: [orchestration, error-handling, retry, fallback, timeout, recovery] +keywords: [error, failure, timeout, retry, fallback, graceful-degradation, cancellation, recovery, partial-success, resilience] +--- + +# Error Recovery + +**Version:** 1.0.0 +**Purpose:** Patterns for handling failures in multi-agent workflows +**Status:** Production Ready + +## Overview + +Error recovery is the practice of handling failures gracefully in multi-agent workflows, ensuring that temporary errors, timeouts, or partial failures don't derail entire workflows. In production systems with external dependencies (AI models, APIs, network calls), failures are inevitable. The question is not "will it fail?" but "how will we handle it when it does?" + +This skill provides battle-tested patterns for: +- **Timeout handling** (external models taking >30s) +- **API failure recovery** (401, 500, network errors) +- **Partial success strategies** (some agents succeed, others fail) +- **User cancellation** (graceful Ctrl+C handling) +- **Missing tools** (claudish not installed) +- **Out of credits** (payment/quota errors) +- **Retry strategies** (exponential backoff, max retries) + +With proper error recovery, workflows become **resilient** and **production-ready**. 
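+
+As a rough illustration of how these patterns compose, here is a minimal TypeScript sketch of a resilient external-model call. It is a sketch only: `callExternalModel` and `runEmbeddedReview` are hypothetical stand-ins for a real claudish invocation and an embedded Claude review, and the timeout/retry values simply mirror the defaults recommended below.
+
+```typescript
+// Hedged sketch: timeout + retry with backoff + graceful fallback.
+// callExternalModel / runEmbeddedReview are placeholders, not real APIs.
+declare function callExternalModel(model: string, prompt: string): Promise<string>;
+declare function runEmbeddedReview(prompt: string): Promise<string>;
+
+const RETRIABLE_STATUSES = new Set([429, 500, 503]);
+
+function withTimeout<T>(p: Promise<T>, ms: number): Promise<T> {
+  return Promise.race([
+    p,
+    new Promise<T>((_, reject) =>
+      setTimeout(() => reject(new Error(`Timeout after ${ms}ms`)), ms),
+    ),
+  ]);
+}
+
+async function reviewWithRecovery(model: string, prompt: string): Promise<string> {
+  const maxRetries = 3;
+  for (let attempt = 0; attempt < maxRetries; attempt++) {
+    try {
+      // 30s default timeout, as recommended in Pattern 1 below.
+      return await withTimeout(callExternalModel(model, prompt), 30_000);
+    } catch (err) {
+      const status = (err as { status?: number }).status;
+      const retriable =
+        RETRIABLE_STATUSES.has(status ?? 0) || String(err).includes("Timeout");
+      if (!retriable || attempt === maxRetries - 1) break; // 401/404: don't retry
+      await new Promise((r) => setTimeout(r, 2 ** attempt * 1000)); // 1s, then 2s
+    }
+  }
+  // Graceful degradation: fall back to the embedded model.
+  console.warn(`${model} failed; falling back to embedded Claude`);
+  return runEmbeddedReview(prompt);
+}
+```
+
+Everything below refines each piece of this loop: when to retry, when to skip, and how to degrade gracefully.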
+ +## Core Patterns + +### Pattern 1: Timeout Handling + +**Scenario: External Model Takes >30s** + +External AI models via Claudish may take >30s due to: +- Model service overloaded (high demand) +- Network latency (slow connection) +- Complex task (large input, detailed analysis) +- Model thinking time (GPT-5, Grok reasoning models) + +**Detection:** + +``` +Monitor execution time and set timeout limits: + +const TIMEOUT_THRESHOLD = 30000; // 30 seconds + +startTime = Date.now(); +executeClaudish(model, prompt); + +setInterval(() => { + elapsedTime = Date.now() - startTime; + if (elapsedTime > TIMEOUT_THRESHOLD && !modelResponded) { + handleTimeout(); + } +}, 1000); +``` + +**Recovery Strategy:** + +``` +Step 1: Detect Timeout + Log: "Timeout: x-ai/grok-code-fast-1 after 30s with no response" + +Step 2: Notify User + Present options: + "Model 'Grok' timed out after 30 seconds. + Options: + 1. Retry with 60s timeout + 2. Skip this model and continue with others + 3. Cancel entire workflow + + What would you like to do? (1/2/3)" + +Step 3a: User selects RETRY + Increase timeout to 60s + Re-execute claudish with longer timeout + If still times out: Offer skip or cancel + +Step 3b: User selects SKIP + Log: "Skipping Grok review due to timeout" + Mark this model as failed + Continue with remaining models + (Graceful degradation pattern) + +Step 3c: User selects CANCEL + Exit workflow gracefully + Save partial results (if any) + Log cancellation reason +``` + +**Graceful Degradation:** + +``` +Multi-Model Review Example: + +Requested: 5 models (Claude, Grok, Gemini, GPT-5, DeepSeek) +Timeout: Grok after 30s + +Result: + - Claude: Success ✓ + - Grok: Timeout ✗ (skipped) + - Gemini: Success ✓ + - GPT-5: Success ✓ + - DeepSeek: Success ✓ + +Successful: 4/5 models (80%) +Threshold: N ≥ 2 for consolidation ✓ + +Action: + Proceed with consolidation using 4 reviews + Notify user: "4/5 models completed (Grok timeout). Proceeding with 4-model consensus." + +Benefits: + - Workflow completes despite failure + - User gets results (4 models better than 1) + - Timeout doesn't derail entire workflow +``` + +**Example Implementation:** + +```bash +# In codex-code-reviewer agent (proxy mode) + +MODEL="x-ai/grok-code-fast-1" +TIMEOUT=30 + +# Execute with timeout +RESULT=$(timeout ${TIMEOUT}s bash -c " + printf '%s' '$PROMPT' | claudish --model $MODEL --stdin --quiet --auto-approve +" 2>&1) + +# Check exit code +if [ $? -eq 124 ]; then + # Timeout occurred (exit code 124 from timeout command) + echo "⚠️ Timeout: Model $MODEL exceeded ${TIMEOUT}s" >&2 + echo "TIMEOUT_ERROR: Model did not respond within ${TIMEOUT}s" + exit 1 +fi + +# Success - write results +echo "$RESULT" > ai-docs/grok-review.md +echo "Grok review complete. See ai-docs/grok-review.md" +``` + +--- + +### Pattern 2: API Failure Recovery + +**Common API Failure Scenarios:** + +``` +401 Unauthorized: + - Invalid API key (OPENROUTER_API_KEY incorrect) + - Expired API key + - API key not set in environment + +500 Internal Server Error: + - Model service temporarily down + - Server overload + - Model deployment issue + +Network Errors: + - Connection timeout (network slow/unstable) + - DNS resolution failure + - Firewall blocking request + +429 Too Many Requests: + - Rate limit exceeded + - Too many concurrent requests + - Quota exhausted for time window +``` + +**Recovery Strategies by Error Type:** + +**401 Unauthorized:** + +``` +Detection: + API returns 401 status code + +Recovery: + 1. Log: "API authentication failed (401)" + 2. 
Check if OPENROUTER_API_KEY is set: + if [ -z "$OPENROUTER_API_KEY" ]; then + notifyUser("OpenRouter API key not found. Set OPENROUTER_API_KEY in .env") + else + notifyUser("Invalid OpenRouter API key. Check .env file") + fi + 3. Skip all external models + 4. Fallback to embedded Claude only + 5. Notify user: + "⚠️ API authentication failed. Falling back to embedded Claude. + To fix: Add valid OPENROUTER_API_KEY to .env file." + +No retry (authentication won't fix itself) +``` + +**500 Internal Server Error:** + +``` +Detection: + API returns 500 status code + +Recovery: + 1. Log: "Model service error (500): x-ai/grok-code-fast-1" + 2. Wait 5 seconds (give service time to recover) + 3. Retry ONCE + 4. If retry succeeds: Continue normally + 5. If retry fails: Skip this model, continue with others + +Example: + try { + result = await claudish(model, prompt); + } catch (error) { + if (error.status === 500) { + log("500 error, waiting 5s before retry..."); + await sleep(5000); + + try { + result = await claudish(model, prompt); // Retry + log("Retry succeeded"); + } catch (retryError) { + log("Retry failed, skipping model"); + skipModel(model); + continueWithRemaining(); + } + } + } + +Max retries: 1 (avoid long delays) +``` + +**Network Errors:** + +``` +Detection: + - Connection timeout + - ECONNREFUSED + - ETIMEDOUT + - DNS resolution failure + +Recovery: + Retry up to 3 times with exponential backoff: + + async function retryWithBackoff(fn, maxRetries = 3) { + for (let i = 0; i < maxRetries; i++) { + try { + return await fn(); + } catch (error) { + if (!isNetworkError(error)) throw error; // Not retriable + if (i === maxRetries - 1) throw error; // Max retries reached + + const delay = Math.pow(2, i) * 1000; // 1s, 2s, 4s + log(`Network error, retrying in ${delay}ms (attempt ${i+1}/${maxRetries})`); + await sleep(delay); + } + } + } + + result = await retryWithBackoff(() => claudish(model, prompt)); + +Rationale: Network errors are often transient (temporary) +``` + +**429 Rate Limiting:** + +``` +Detection: + API returns 429 status code + Response may include Retry-After header + +Recovery: + 1. Check Retry-After header (seconds to wait) + 2. If present: Wait for specified time + 3. If not present: Wait 60s (default) + 4. Retry ONCE after waiting + 5. If still rate limited: Skip model + +Example: + if (error.status === 429) { + const retryAfter = error.headers['retry-after'] || 60; + log(`Rate limited. Waiting ${retryAfter}s before retry...`); + await sleep(retryAfter * 1000); + + try { + result = await claudish(model, prompt); + } catch (retryError) { + log("Still rate limited after retry. Skipping model."); + skipModel(model); + } + } + +Note: Respect Retry-After header (avoid hammering API) +``` + +**Graceful Degradation for All API Failures:** + +``` +Fallback Strategy: + +If ALL external models fail (401, 500, network, etc.): + 1. Log all failures + 2. Notify user: + "⚠️ All external models failed. Falling back to embedded Claude. + Errors: + - Grok: Network timeout + - Gemini: 500 Internal Server Error + - GPT-5: Rate limited (429) + - DeepSeek: Authentication failed (401) + + Proceeding with Claude Sonnet (embedded) only." + + 3. Run embedded Claude review + 4. Present results with disclaimer: + "Review completed using Claude only (external models unavailable). + For multi-model consensus, try again later." 
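+
+Illustrative check (pseudocode; failedModels and selectedModels are
+assumed to be tracked by the orchestrator, not a real API):
+
+  if (failedModels.length === selectedModels.length) {
+    log("All external models failed: " + summarizeErrors(failedModels));
+    review = await runEmbeddedClaudeReview();
+    presentWithDisclaimer(review);
+  }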
+ +Benefits: + - User still gets results (better than nothing) + - Workflow completes (not aborted) + - Clear error communication (user knows what happened) +``` + +--- + +### Pattern 3: Partial Success Strategies + +**Scenario: 2 of 4 Models Complete Successfully** + +In multi-model workflows, it's common for some models to succeed while others fail. + +**Tracking Success/Failure:** + +``` +const results = await Promise.allSettled([ + Task({ subagent: "reviewer", model: "claude" }), + Task({ subagent: "reviewer", model: "grok" }), + Task({ subagent: "reviewer", model: "gemini" }), + Task({ subagent: "reviewer", model: "gpt-5" }) +]); + +const successful = results.filter(r => r.status === 'fulfilled'); +const failed = results.filter(r => r.status === 'rejected'); + +log(`Success: ${successful.length}/4`); +log(`Failed: ${failed.length}/4`); +``` + +**Decision Logic:** + +``` +If N ≥ 2 successful: + → Proceed with consolidation + → Use N reviews (not all 4) + → Notify user about failures + +If N < 2 successful: + → Insufficient data for consensus + → Offer user choice: + 1. Retry failures + 2. Abort workflow + 3. Proceed with embedded Claude only + +Example: + +successful.length = 2 (Claude, Gemini) +failed.length = 2 (Grok timeout, GPT-5 500 error) + +Action: + notifyUser("2/4 models completed successfully. Proceeding with consolidation using 2 reviews."); + + consolidateReviews([ + "ai-docs/claude-review.md", + "ai-docs/gemini-review.md" + ]); + + presentResults({ + totalModels: 4, + successful: 2, + failureReasons: { + grok: "Timeout after 30s", + gpt5: "500 Internal Server Error" + } + }); +``` + +**Communication Strategy:** + +``` +Be transparent with user about partial success: + +❌ WRONG: + "Multi-model review complete!" + (User assumes all 4 models ran) + +✅ CORRECT: + "Multi-model review complete (2/4 models succeeded). + + Successful: + - Claude Sonnet ✓ + - Gemini 2.5 Flash ✓ + + Failed: + - Grok: Timeout after 30s + - GPT-5 Codex: 500 Internal Server Error + + Proceeding with 2-model consensus. + Top issues: [...]" + +User knows: + - What succeeded (Claude, Gemini) + - What failed (Grok, GPT-5) + - Why they failed (timeout, 500 error) + - What action was taken (2-model consensus) +``` + +**Consolidation Adapts to N Models:** + +``` +Consolidation logic must handle variable N: + +✅ CORRECT - Flexible N: + function consolidateReviews(reviewFiles) { + const N = reviewFiles.length; + log(`Consolidating ${N} reviews`); + + // Consensus thresholds adapt to N + const unanimousThreshold = N; // All N agree + const strongThreshold = Math.ceil(N * 0.67); // 67%+ agree + const majorityThreshold = Math.ceil(N * 0.5); // 50%+ agree + + // Apply consensus analysis with dynamic thresholds + ... + } + +❌ WRONG - Hardcoded N: + // Assumes always 4 models + const unanimousThreshold = 4; // Breaks if N = 2! 
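+
+Worked example (illustrative, N = 2 — say only Claude and Gemini succeeded):
+  unanimousThreshold = 2                  // both models must agree
+  strongThreshold   = ceil(2 × 0.67) = 2  // same as unanimous at this N
+  majorityThreshold = ceil(2 × 0.5)  = 1  // any single model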
+``` + +--- + +### Pattern 4: User Cancellation Handling (Ctrl+C) + +**Scenario: User Presses Ctrl+C During Workflow** + +Users may cancel long-running workflows for various reasons: +- Taking too long +- Realized they want different configuration +- Accidentally triggered workflow +- Need to prioritize other work + +**Cleanup Strategy:** + +``` +process.on('SIGINT', async () => { + log("⚠️ User cancelled workflow (Ctrl+C)"); + + // Step 1: Stop all running processes gracefully + await stopAllAgents(); + + // Step 2: Save partial results to files + const partialResults = await collectPartialResults(); + await writeFile('ai-docs/partial-review.md', partialResults); + + // Step 3: Log what was completed vs cancelled + log("Workflow cancelled"); + log("Completed:"); + log(" - PHASE 1: Requirements gathering ✓"); + log(" - PHASE 2: Architecture planning ✓"); + log("Cancelled:"); + log(" - PHASE 3: Implementation (in progress)"); + log(" - PHASE 4: Testing (not started)"); + log(" - PHASE 5: Review (not started)"); + + // Step 4: Notify user + console.log("\n⚠️ Workflow cancelled by user."); + console.log("Partial results saved to ai-docs/partial-review.md"); + console.log("Completed phases: 2/5"); + + // Step 5: Clean exit + process.exit(0); +}); +``` + +**Save Partial Results:** + +``` +Partial Results Format: + +# Workflow Cancelled by User + +**Status:** Cancelled during PHASE 3 (Implementation) +**Completed:** 2/5 phases (40%) +**Duration:** 8 minutes (of estimated 20 minutes) +**Timestamp:** 2025-11-22T14:30:00Z + +## Completed Phases + +### PHASE 1: Requirements Gathering ✓ +- User requirements documented +- See: ai-docs/requirements.md + +### PHASE 2: Architecture Planning ✓ +- Architecture plan generated +- See: ai-docs/architecture-plan.md + +## Cancelled Phases + +### PHASE 3: Implementation (IN PROGRESS) +- Status: 30% complete +- Files created: src/auth.ts (partial) +- Files pending: src/routes.ts, src/services.ts + +### PHASE 4: Testing (NOT STARTED) +- Pending: Test suite creation + +### PHASE 5: Code Review (NOT STARTED) +- Pending: Multi-model review + +## How to Resume + +To resume from PHASE 3: +1. Review partial implementation in src/auth.ts +2. Complete remaining implementation +3. Continue with PHASE 4 (Testing) + +Or restart workflow from beginning with updated requirements. +``` + +**Resumable Workflows (Advanced):** + +``` +Save workflow state for potential resume: + +// During workflow execution +await saveWorkflowState({ + currentPhase: 3, + totalPhases: 5, + completedPhases: [1, 2], + pendingPhases: [3, 4, 5], + partialResults: { + phase1: "ai-docs/requirements.md", + phase2: "ai-docs/architecture-plan.md", + phase3: "src/auth.ts (partial)" + } +}, '.claude/workflow-state.json'); + +// On next invocation +const state = await loadWorkflowState('.claude/workflow-state.json'); +if (state) { + askUser("Found incomplete workflow from previous session. Resume? 
(Yes/No)"); + + if (userSaysYes) { + resumeFromPhase(state.currentPhase); + } else { + deleteWorkflowState(); + startFresh(); + } +} +``` + +--- + +### Pattern 5: Claudish Not Installed + +**Scenario: User Requests Multi-Model Review but Claudish Missing** + +**Detection:** + +``` +Check if claudish CLI is installed: + +Bash: which claudish +Exit code 0: Installed ✓ +Exit code 1: Not installed ✗ + +Or: + +Bash: claudish --version +Output: "claudish version 2.2.1" → Installed ✓ +Error: "command not found" → Not installed ✗ +``` + +**Recovery Strategy:** + +``` +Step 1: Detect Missing Claudish + hasClaudish = checkCommand('which claudish'); + + if (!hasClaudish) { + log("Claudish CLI not found"); + notifyUser(); + } + +Step 2: Notify User with Installation Instructions + "⚠️ Claudish CLI not found. External AI models unavailable. + + To enable multi-model review: + 1. Install: npm install -g claudish + 2. Configure: Set OPENROUTER_API_KEY in .env + 3. Re-run this command + + For now, falling back to embedded Claude Sonnet only." + +Step 3: Fallback to Embedded Claude + log("Falling back to embedded Claude review"); + runEmbeddedReviewOnly(); + +Benefits: + - Workflow doesn't fail (graceful degradation) + - User gets results (Claude review) + - Clear instructions for enabling multi-model (future use) +``` + +**Example Implementation:** + +``` +Phase 2: Model Selection + +Bash: which claudish +if [ $? -ne 0 ]; then + # Claudish not installed + echo "⚠️ Claudish CLI not found." + echo "Install: npm install -g claudish" + echo "Falling back to embedded Claude only." + + # Skip external model selection + selectedModels=["claude-sonnet"] +else + # Claudish available + echo "Claudish CLI found ✓" + # Proceed with external model selection + selectedModels=["claude-sonnet", "grok", "gemini", "gpt-5"] +fi +``` + +--- + +### Pattern 6: Out of OpenRouter Credits + +**Scenario: External Model API Call Fails Due to Insufficient Credits** + +**Detection:** + +``` +API returns: + - 402 Payment Required (HTTP status) + - Or error message contains "credits", "quota", "billing" + +Example error messages: + - "Insufficient credits" + - "Credit balance too low" + - "Quota exceeded" + - "Payment required" +``` + +**Recovery Strategy:** + +``` +Step 1: Detect Credit Exhaustion + if (error.status === 402 || error.message.includes('credits')) { + handleCreditExhaustion(); + } + +Step 2: Log Event + log("OpenRouter credits exhausted"); + +Step 3: Notify User + "⚠️ OpenRouter credits exhausted. External models unavailable. + + To fix: + 1. Visit https://openrouter.ai + 2. Add credits to your account + 3. Re-run this command + + For now, falling back to embedded Claude Sonnet." 
+ +Step 4: Skip All External Models + skipAllExternalModels(); + +Step 5: Fallback to Embedded Claude + runEmbeddedReviewOnly(); + +Benefits: + - Workflow completes (doesn't fail) + - User gets results (Claude review) + - Clear instructions for adding credits +``` + +**Proactive Credit Check (Advanced):** + +``` +Before expensive multi-model operation: + +Step 1: Check OpenRouter Credit Balance + Bash: curl -H "Authorization: Bearer $OPENROUTER_API_KEY" \ + https://openrouter.ai/api/v1/auth/key + + Response: { "data": { "usage": 1.23, "limit": 10.00 } } + +Step 2: Estimate Cost + estimatedCost = 0.008 // From cost estimation pattern + +Step 3: Check if Sufficient Credits + remainingCredits = 10.00 - 1.23 = 8.77 + if (estimatedCost > remainingCredits) { + warnUser("Insufficient credits ($8.77 remaining, $0.008 needed)"); + } + +Benefits: + - Warn before operation (not after failure) + - User can add credits first (avoid wasted time) +``` + +--- + +### Pattern 7: Retry Strategies + +**Exponential Backoff:** + +``` +Retry with increasing delays to avoid overwhelming services: + +Retry Schedule: + 1st retry: Wait 1 second + 2nd retry: Wait 2 seconds + 3rd retry: Wait 4 seconds + Max retries: 3 + +Formula: delay = 2^attempt × 1000ms + +async function retryWithBackoff(fn, maxRetries = 3) { + for (let attempt = 0; attempt < maxRetries; attempt++) { + try { + return await fn(); + } catch (error) { + if (!isRetriable(error)) { + throw error; // Don't retry non-retriable errors + } + + if (attempt === maxRetries - 1) { + throw error; // Max retries reached + } + + const delay = Math.pow(2, attempt) * 1000; + log(`Retry ${attempt + 1}/${maxRetries} after ${delay}ms`); + await sleep(delay); + } + } +} +``` + +**When to Retry:** + +``` +Retriable Errors (temporary, retry likely to succeed): + ✓ Network errors (ETIMEDOUT, ECONNREFUSED) + ✓ 500 Internal Server Error (service temporarily down) + ✓ 503 Service Unavailable (overloaded, retry later) + ✓ 429 Rate Limiting (wait for reset, then retry) + +Non-Retriable Errors (permanent, retry won't help): + ✗ 401 Unauthorized (bad credentials) + ✗ 403 Forbidden (insufficient permissions) + ✗ 404 Not Found (model doesn't exist) + ✗ 400 Bad Request (invalid input) + ✗ User cancellation (SIGINT) + +Function: + function isRetriable(error) { + const retriableCodes = [500, 503, 429]; + const retriableTypes = ['ETIMEDOUT', 'ECONNREFUSED', 'ENOTFOUND']; + + return ( + retriableCodes.includes(error.status) || + retriableTypes.includes(error.code) + ); + } +``` + +**Max Retry Limits:** + +``` +Set appropriate max retries by operation type: + +Network requests: 3 retries (transient failures) +API calls: 1-2 retries (avoid long delays) +User input: 0 retries (ask user to retry manually) + +Example: + result = await retryWithBackoff( + () => claudish(model, prompt), + maxRetries: 2 // 2 retries for API calls + ); +``` + +--- + +## Integration with Other Skills + +**error-recovery + multi-model-validation:** + +``` +Use Case: Handling external model failures in parallel execution + +Step 1: Parallel Execution (multi-model-validation) + Launch 5 models simultaneously + +Step 2: Error Recovery (error-recovery) + Model 1: Success ✓ + Model 2: Timeout → Skip (timeout handling pattern) + Model 3: 500 error → Retry once, then skip + Model 4: Success ✓ + Model 5: Success ✓ + +Step 3: Partial Success Strategy (error-recovery) + 3/5 successful (≥ 2 threshold) + Proceed with consolidation using 3 reviews + +Step 4: Consolidation (multi-model-validation) + Consolidate 3 successful 
reviews + Notify user about 2 failures +``` + +**error-recovery + quality-gates:** + +``` +Use Case: Test-driven loop with error recovery + +Step 1: Run Tests (quality-gates TDD pattern) + Bash: bun test + +Step 2: If Test Execution Fails (error-recovery) + Error type: Syntax error in test file + + Recovery: + - Fix syntax error + - Retry test execution + - If still fails: Notify user, skip TDD phase + +Step 3: If Tests Pass (quality-gates) + Proceed to code review +``` + +**error-recovery + multi-agent-coordination:** + +``` +Use Case: Agent selection with fallback + +Step 1: Agent Selection (multi-agent-coordination) + Preferred: ui-developer-codex (external validation) + +Step 2: Check Tool Availability (error-recovery) + Bash: which claudish + Result: Not found + +Step 3: Fallback Strategy (error-recovery) + Log: "Claudish not installed, falling back to embedded ui-developer" + Use: ui-developer (embedded) + +Step 4: Execution (multi-agent-coordination) + Task: ui-developer +``` + +--- + +## Best Practices + +**Do:** +- ✅ Set timeout limits (30s default, 60s for complex tasks) +- ✅ Retry transient errors (network, 500, 503) +- ✅ Use exponential backoff (avoid hammering services) +- ✅ Skip non-retriable errors (401, 404, don't retry) +- ✅ Provide graceful degradation (fallback to embedded Claude) +- ✅ Save partial results on cancellation +- ✅ Communicate transparently (tell user what failed and why) +- ✅ Adapt to partial success (N ≥ 2 reviews is useful) + +**Don't:** +- ❌ Retry indefinitely (set max retry limits) +- ❌ Retry non-retriable errors (waste time on 401, 404) +- ❌ Fail entire workflow for single model failure (graceful degradation) +- ❌ Hide errors from user (be transparent) +- ❌ Discard partial results on failure (save what succeeded) +- ❌ Ignore user cancellation (handle SIGINT gracefully) +- ❌ Retry without delay (use backoff) + +**Performance:** +- Exponential backoff: Prevents overwhelming services +- Max retries: Limits wasted time (3 retries = <10s overhead) +- Graceful degradation: Workflows complete despite failures + +--- + +## Examples + +### Example 1: Timeout with Retry + +**Scenario:** Grok model times out, user retries with longer timeout + +**Execution:** + +``` +Attempt 1: + Bash: timeout 30s claudish --model x-ai/grok-code-fast-1 ... + Result: Timeout after 30s + + Notify user: + "⚠️ Grok timed out after 30s. + Options: + 1. Retry with 60s timeout + 2. Skip Grok + 3. Cancel workflow" + + User selects: 1 (Retry) + +Attempt 2: + Bash: timeout 60s claudish --model x-ai/grok-code-fast-1 ... + Result: Success after 45s + + Log: "Grok review completed on retry (45s)" + Write: ai-docs/grok-review.md + Continue with workflow +``` + +--- + +### Example 2: Partial Success (2/4 Models) + +**Scenario:** 4 models selected, 2 fail, proceed with 2 + +**Execution:** + +``` +Launch 4 models in parallel: + Task: Claude (embedded) + Task: Grok (external) + Task: Gemini (external) + Task: GPT-5 (external) + +Results: + Claude: Success ✓ (2 min) + Grok: Timeout ✗ (30s) + Gemini: 500 error ✗ (retry failed) + GPT-5: Success ✓ (3 min) + +successful.length = 2 (Claude, GPT-5) +2 ≥ 2 ✓ (threshold met) + +Notify user: + "2/4 models completed successfully. + + Successful: + - Claude Sonnet ✓ + - GPT-5 Codex ✓ + + Failed: + - Grok: Timeout after 30s + - Gemini: 500 Internal Server Error (retry failed) + + Proceeding with 2-model consensus." 
+ +Consolidate: + consolidateReviews([ + "ai-docs/claude-review.md", + "ai-docs/gpt5-review.md" + ]); + +Present results with 2-model consensus +``` + +--- + +### Example 3: User Cancellation + +**Scenario:** User presses Ctrl+C during PHASE 3 + +**Execution:** + +``` +Workflow starts: + PHASE 1: Requirements ✓ (30s) + PHASE 2: Architecture ✓ (2 min) + PHASE 3: Implementation (in progress, 3 min elapsed) + +User presses Ctrl+C: + Signal: SIGINT received + +Handler executes: + Log: "User cancelled workflow (Ctrl+C)" + + Stop agents: + - backend-developer (currently executing) + - Terminate gracefully + + Collect partial results: + - ai-docs/requirements.md ✓ + - ai-docs/architecture-plan.md ✓ + - src/auth.ts (30% complete) + + Save to file: + Write: ai-docs/partial-implementation.md + "# Workflow Cancelled + Completed: PHASE 1, PHASE 2 + Partial: PHASE 3 (30%) + Pending: PHASE 4, PHASE 5" + + Notify user: + "⚠️ Workflow cancelled by user. + Partial results saved to ai-docs/partial-implementation.md + Completed: 2/5 phases (40%)" + + Exit: process.exit(0) +``` + +--- + +## Troubleshooting + +**Problem: Workflow fails after single model timeout** + +Cause: No graceful degradation + +Solution: Continue with remaining models + +``` +❌ Wrong: + if (timeout) { + throw new Error("Model timed out"); + } + +✅ Correct: + if (timeout) { + log("Model timed out, skipping"); + skipModel(); + continueWithRemaining(); + } +``` + +--- + +**Problem: Retrying 401 errors indefinitely** + +Cause: Retrying non-retriable errors + +Solution: Check if error is retriable + +``` +❌ Wrong: + for (let i = 0; i < 10; i++) { + try { return await fn(); } + catch (e) { /* retry all errors */ } + } + +✅ Correct: + for (let i = 0; i < 3; i++) { + try { return await fn(); } + catch (e) { + if (!isRetriable(e)) throw e; // Don't retry 401 + await sleep(delay); + } + } +``` + +--- + +**Problem: No visibility into what failed** + +Cause: Not communicating errors to user + +Solution: Transparently report all failures + +``` +❌ Wrong: + "Review complete!" (hides 2 failures) + +✅ Correct: + "Review complete (2/4 models succeeded). + Failed: Grok (timeout), Gemini (500 error)" +``` + +--- + +## Summary + +Error recovery ensures resilient workflows through: + +- **Timeout handling** (detect, retry with longer timeout, or skip) +- **API failure recovery** (retry transient, skip permanent) +- **Partial success strategies** (N ≥ 2 threshold, adapt to failures) +- **User cancellation** (graceful Ctrl+C, save partial results) +- **Missing tools** (claudish not installed, fallback to embedded) +- **Out of credits** (402 error, fallback to free models) +- **Retry strategies** (exponential backoff, max 3 retries) + +With these patterns, workflows are **production-ready** and **resilient** to inevitable failures. + +--- + +**Extracted From:** +- `/review` command error handling (external model failures) +- `/implement` command PHASE 2.5 (test-driven loop error recovery) +- Production experience with Claudish proxy failures +- Multi-model validation resilience requirements diff --git a/skills/multi-agent-coordination/SKILL.md b/skills/multi-agent-coordination/SKILL.md new file mode 100644 index 0000000..65eb979 --- /dev/null +++ b/skills/multi-agent-coordination/SKILL.md @@ -0,0 +1,742 @@ +--- +name: multi-agent-coordination +description: Coordinate multiple agents in parallel or sequential workflows. Use when running agents simultaneously, delegating to sub-agents, switching between specialized agents, or managing agent selection. 
Trigger keywords - "parallel agents", "sequential workflow", "delegate", "multi-agent", "sub-agent", "agent switching", "task decomposition". +version: 0.1.0 +tags: [orchestration, multi-agent, parallel, sequential, delegation, coordination] +keywords: [parallel, sequential, delegate, sub-agent, agent-switching, multi-agent, task-decomposition, coordination] +--- + +# Multi-Agent Coordination + +**Version:** 1.0.0 +**Purpose:** Patterns for coordinating multiple agents in complex workflows +**Status:** Production Ready + +## Overview + +Multi-agent coordination is the foundation of sophisticated Claude Code workflows. This skill provides battle-tested patterns for orchestrating multiple specialized agents to accomplish complex tasks that are beyond the capabilities of a single agent. + +The key challenge in multi-agent systems is **dependencies**. Some tasks must execute sequentially (one agent's output feeds into another), while others can run in parallel (independent validations from different perspectives). Getting this right is the difference between a 5-minute workflow and a 15-minute one. + +This skill teaches you: +- When to run agents in **parallel** vs **sequential** +- How to **select the right agent** for each task +- How to **delegate** to sub-agents without polluting context +- How to manage **context windows** across multiple agent calls + +## Core Patterns + +### Pattern 1: Sequential vs Parallel Execution + +**When to Use Sequential:** + +Use sequential execution when there are **dependencies** between agents: +- Agent B needs Agent A's output as input +- Workflow phases must complete in order (plan → implement → test → review) +- Each agent modifies shared state (same files) + +**Example: Multi-Phase Implementation** + +``` +Phase 1: Architecture Planning + Task: api-architect + Output: ai-docs/architecture-plan.md + Wait for completion ✓ + +Phase 2: Implementation (depends on Phase 1) + Task: backend-developer + Input: Read ai-docs/architecture-plan.md + Output: src/auth.ts, src/routes.ts + Wait for completion ✓ + +Phase 3: Testing (depends on Phase 2) + Task: test-architect + Input: Read src/auth.ts, src/routes.ts + Output: tests/auth.test.ts +``` + +**When to Use Parallel:** + +Use parallel execution when agents are **independent**: +- Multiple validation perspectives (designer + tester + reviewer) +- Multiple AI models reviewing same code (Grok + Gemini + Claude) +- Multiple feature implementations in separate files + +**Example: Multi-Perspective Validation** + +``` +Single Message with Multiple Task Calls: + +Task: designer + Prompt: Validate UI against Figma design + Output: ai-docs/design-review.md +--- +Task: ui-manual-tester + Prompt: Test UI in browser for usability + Output: ai-docs/testing-report.md +--- +Task: senior-code-reviewer + Prompt: Review code quality and patterns + Output: ai-docs/code-review.md + +All three execute simultaneously (3x speedup!) +Wait for all to complete, then consolidate results. 
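+
+Illustrative timing (assuming ~4 min per validation):
+  Sequential: 4 + 4 + 4 = 12 min
+  Parallel:   max(4, 4, 4) ≈ 4 min → ~3x speedup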
+``` + +**The 4-Message Pattern for True Parallel Execution:** + +This is **CRITICAL** for achieving true parallelism: + +``` +Message 1: Preparation (Bash Only) + - Create workspace directories + - Validate inputs + - Write context files + - NO Task calls, NO TodoWrite + +Message 2: Parallel Execution (Task Only) + - Launch ALL agents in SINGLE message + - ONLY Task tool calls + - Each Task is independent + - All execute simultaneously + +Message 3: Consolidation (Task Only) + - Launch consolidation agent + - Automatically triggered when N agents complete + +Message 4: Present Results + - Show user final consolidated results + - Include links to detailed reports +``` + +**Anti-Pattern: Mixing Tool Types Breaks Parallelism** + +``` +❌ WRONG - Executes Sequentially: + await TodoWrite({...}); // Tool 1 + await Task({...}); // Tool 2 - waits for TodoWrite + await Bash({...}); // Tool 3 - waits for Task + await Task({...}); // Tool 4 - waits for Bash + +✅ CORRECT - Executes in Parallel: + await Task({...}); // Task 1 + await Task({...}); // Task 2 + await Task({...}); // Task 3 + // All execute simultaneously +``` + +**Why Mixing Fails:** + +Claude Code sees different tool types and assumes there are dependencies between them, forcing sequential execution. Using a single tool type (all Task calls) signals that operations are independent and can run in parallel. + +--- + +### Pattern 2: Agent Selection by Task Type + +**Task Detection Logic:** + +Intelligent workflows automatically detect task type and select appropriate agents: + +``` +Task Type Detection: + +IF request mentions "API", "endpoint", "backend", "database": + → API-focused workflow + → Use: api-architect, backend-developer, test-architect + → Skip: designer, ui-developer (not relevant) + +ELSE IF request mentions "UI", "component", "design", "Figma": + → UI-focused workflow + → Use: designer, ui-developer, ui-manual-tester + → Optional: ui-developer-codex (external validation) + +ELSE IF request mentions both API and UI: + → Mixed workflow + → Use all relevant agents from both categories + → Coordinate between backend and frontend agents + +ELSE IF request mentions "test", "coverage", "bug": + → Testing-focused workflow + → Use: test-architect, ui-manual-tester + → Optional: codebase-detective (for bug investigation) + +ELSE IF request mentions "review", "validate", "feedback": + → Review-focused workflow + → Use: senior-code-reviewer, designer, ui-developer + → Optional: external model reviewers +``` + +**Agent Capability Matrix:** + +| Task Type | Primary Agent | Secondary Agent | Optional External | +|-----------|---------------|-----------------|-------------------| +| API Implementation | backend-developer | api-architect | - | +| UI Implementation | ui-developer | designer | ui-developer-codex | +| Testing | test-architect | ui-manual-tester | - | +| Code Review | senior-code-reviewer | - | codex-code-reviewer | +| Architecture Planning | api-architect OR frontend-architect | - | plan-reviewer | +| Bug Investigation | codebase-detective | test-architect | - | +| Design Validation | designer | ui-developer | designer-codex | + +**Agent Switching Pattern:** + +Some workflows benefit from **adaptive agent selection** based on context: + +``` +Example: UI Development with External Validation + +Base Implementation: + Task: ui-developer + Prompt: Implement navbar component from design + +User requests external validation: + → Switch to ui-developer-codex OR add parallel ui-developer-codex + → Run both: embedded ui-developer + 
external ui-developer-codex + → Consolidate feedback from both + +Scenario 1: User wants speed + → Use ONLY ui-developer (embedded, fast) + +Scenario 2: User wants highest quality + → Use BOTH ui-developer AND ui-developer-codex (parallel) + → Consensus analysis on feedback + +Scenario 3: User is out of credits + → Fallback to ui-developer only + → Notify user external validation unavailable +``` + +--- + +### Pattern 3: Sub-Agent Delegation + +**File-Based Instructions (Context Isolation):** + +When delegating to sub-agents, use **file-based instructions** to avoid context pollution: + +``` +✅ CORRECT - File-Based Delegation: + +Step 1: Write instructions to file + Write: ai-docs/architecture-instructions.md + Content: "Design authentication system with JWT tokens..." + +Step 2: Delegate to agent with file reference + Task: api-architect + Prompt: "Read instructions from ai-docs/architecture-instructions.md + and create architecture plan." + +Step 3: Agent reads file, does work, writes output + Agent reads: ai-docs/architecture-instructions.md + Agent writes: ai-docs/architecture-plan.md + +Step 4: Agent returns brief summary ONLY + Return: "Architecture plan complete. See ai-docs/architecture-plan.md" + +Step 5: Orchestrator reads output file if needed + Read: ai-docs/architecture-plan.md + (Only if orchestrator needs to process the output) +``` + +**Why File-Based?** + +- **Avoids context pollution:** Long user requirements don't bloat orchestrator context +- **Reusable:** Multiple agents can read same instruction file +- **Debuggable:** Files persist after workflow completes +- **Clean separation:** Input file, output file, orchestrator stays lightweight + +**Anti-Pattern: Inline Delegation** + +``` +❌ WRONG - Context Pollution: + +Task: api-architect + Prompt: "Design authentication system with: + - JWT tokens with refresh token rotation + - Email/password login with bcrypt hashing + - OAuth2 integration with Google, GitHub + - Rate limiting on login endpoint (5 attempts per 15 min) + - Password reset flow with time-limited tokens + - Email verification on signup + - Role-based access control (admin, user, guest) + - Session management with Redis + - Security headers (CORS, CSP, HSTS) + - ... (500 more lines of requirements)" + +Problem: Orchestrator's context now contains 500+ lines of requirements + that are only relevant to the architect agent. +``` + +**Brief Summary Returns:** + +Sub-agents should return **2-5 sentence summaries**, not full output: + +``` +✅ CORRECT - Brief Summary: + "Architecture plan complete. Designed 3-layer authentication: + JWT with refresh tokens, OAuth2 integration (Google/GitHub), + and Redis session management. See ai-docs/architecture-plan.md + for detailed component breakdown." + +❌ WRONG - Full Output: + "Architecture plan: + [500 lines of detailed architecture documentation] + Components: AuthController, TokenService, OAuthService... + [another 500 lines]" +``` + +**Proxy Mode Invocation:** + +For external AI models (Claudish), use the PROXY_MODE directive: + +``` +Task: codex-code-reviewer PROXY_MODE: x-ai/grok-code-fast-1 + Prompt: "Review authentication implementation for security issues. + Code context in ai-docs/code-review-context.md" + +Agent Behavior: + 1. Detects PROXY_MODE directive + 2. Extracts model: x-ai/grok-code-fast-1 + 3. Extracts task: "Review authentication implementation..." + 4. Executes: claudish --model x-ai/grok-code-fast-1 --stdin <<< "..." + 5. Waits for full response (blocking execution) + 6. 
Writes: ai-docs/grok-review.md (full detailed review) + 7. Returns: "Grok review complete. Found 3 CRITICAL issues. See ai-docs/grok-review.md" +``` + +**Key: Blocking Execution** + +External models MUST execute synchronously (blocking) so the agent waits for the full response: + +``` +✅ CORRECT - Blocking: + RESULT=$(claudish --model x-ai/grok-code-fast-1 --stdin <<< "$PROMPT") + echo "$RESULT" > ai-docs/grok-review.md + echo "Review complete - see ai-docs/grok-review.md" + +❌ WRONG - Background (returns before completion): + claudish --model x-ai/grok-code-fast-1 --stdin <<< "$PROMPT" & + echo "Review started..." # Agent returns immediately, review not done! +``` + +--- + +### Pattern 4: Context Window Management + +**When to Delegate:** + +Delegate to sub-agents when: +- Task is self-contained (clear input → output) +- Output is large (architecture plan, test suite, review report) +- Task requires specialized expertise (designer, tester, reviewer) +- Multiple independent tasks can run in parallel + +**When to Execute in Main Context:** + +Execute in main orchestrator when: +- Task is small (simple file edit, command execution) +- Output is brief (yes/no decision, status check) +- Task depends on orchestrator state (current phase, iteration count) +- Context pollution risk is low + +**Context Size Estimation:** + +**Note:** Token estimates below are approximations based on typical usage. Actual context consumption varies by skill complexity, Claude model version, and conversation history. Use these as guidelines, not exact measurements. + +Estimate context usage to decide delegation strategy: + +``` +Context Budget: ~200k tokens (Claude Sonnet 4.5 - actual varies by model) + +Current context usage breakdown: + - System prompt: 10k tokens + - Skill content (5 skills): 10k tokens + - Command instructions: 5k tokens + - User request: 1k tokens + - Conversation history: 20k tokens + ─────────────────────────────────── + Total used: 46k tokens + Remaining: 154k tokens + +Safe threshold for delegation: If task will consume >30k tokens, delegate + +Example: Architecture planning for large system + - Requirements: 5k tokens + - Expected output: 20k tokens + - Total: 25k tokens + ─────────────────────────────────── + Decision: Delegate (keeps orchestrator lightweight) +``` + +**Delegation Strategy by Context Size:** + +| Task Output Size | Strategy | +|------------------|----------| +| < 1k tokens | Execute in orchestrator | +| 1k - 10k tokens | Delegate with summary return | +| 10k - 30k tokens | Delegate with file-based output | +| > 30k tokens | Multi-agent decomposition | + +**Example: Multi-Agent Decomposition** + +``` +User Request: "Implement complete e-commerce system" + +This is >100k tokens if done by single agent. Decompose: + +Phase 1: Break into sub-systems + - Product catalog + - Shopping cart + - Checkout flow + - User authentication + - Order management + - Payment integration + +Phase 2: Delegate each sub-system to separate agent + Task: backend-developer + Instruction file: ai-docs/product-catalog-requirements.md + Output file: ai-docs/product-catalog-implementation.md + + Task: backend-developer + Instruction file: ai-docs/shopping-cart-requirements.md + Output file: ai-docs/shopping-cart-implementation.md + + ... (6 parallel agent invocations) + +Phase 3: Integration agent + Task: backend-developer + Instruction: "Integrate 6 sub-systems. 
Read output files: + ai-docs/*-implementation.md" + Output: ai-docs/integration-plan.md + +Total context per agent: ~20k tokens (manageable) +vs. Single agent: 120k+ tokens (context overflow risk) +``` + +--- + +## Integration with Other Skills + +**multi-agent-coordination + multi-model-validation:** + +``` +Use Case: Code review with multiple AI models + +Step 1: Agent Selection (multi-agent-coordination) + - Detect task type: Code review + - Select agents: senior-code-reviewer (embedded) + external models + +Step 2: Parallel Execution (multi-model-validation) + - Follow 4-Message Pattern + - Launch all reviewers simultaneously + - Wait for all to complete + +Step 3: Consolidation (multi-model-validation) + - Auto-consolidate reviews + - Apply consensus analysis +``` + +**multi-agent-coordination + quality-gates:** + +``` +Use Case: Iterative UI validation + +Step 1: Agent Selection (multi-agent-coordination) + - Detect task type: UI validation + - Select agents: designer, ui-developer + +Step 2: Iteration Loop (quality-gates) + - Run designer validation + - If not PASS: delegate to ui-developer for fixes + - Loop until PASS or max iterations + +Step 3: User Validation Gate (quality-gates) + - MANDATORY user approval + - Collect feedback if issues found +``` + +**multi-agent-coordination + todowrite-orchestration:** + +``` +Use Case: Multi-phase implementation workflow + +Step 1: Initialize TodoWrite (todowrite-orchestration) + - Create task list for all phases + +Step 2: Sequential Agent Delegation (multi-agent-coordination) + - Phase 1: api-architect + - Phase 2: backend-developer (depends on Phase 1) + - Phase 3: test-architect (depends on Phase 2) + - Update TodoWrite after each phase +``` + +--- + +## Best Practices + +**Do:** +- ✅ Use parallel execution for independent tasks (3-5x speedup) +- ✅ Use sequential execution when there are dependencies +- ✅ Use file-based instructions to avoid context pollution +- ✅ Return brief summaries (2-5 sentences) from sub-agents +- ✅ Select agents based on task type (API/UI/Testing/Review) +- ✅ Decompose large tasks into multiple sub-agent calls +- ✅ Estimate context usage before delegating + +**Don't:** +- ❌ Mix tool types in parallel execution (breaks parallelism) +- ❌ Inline long instructions in Task prompts (context pollution) +- ❌ Return full output from sub-agents (use files instead) +- ❌ Use parallel execution for dependent tasks (wrong results) +- ❌ Use single agent for >100k token tasks (context overflow) +- ❌ Forget to wait for all parallel tasks before consolidating + +**Performance Tips:** +- Parallel execution: 3-5x faster than sequential (5min vs 15min) +- File-based delegation: Saves 50-80% context usage +- Agent switching: Adapt to user preferences (speed vs quality) +- Context decomposition: Enables tasks that would otherwise overflow + +--- + +## Examples + +### Example 1: Parallel Multi-Model Code Review + +**Scenario:** User requests "Review my authentication code with Grok and Gemini" + +**Agent Selection:** +- Task type: Code review +- Agents: senior-code-reviewer (embedded), external Grok, external Gemini + +**Execution:** + +``` +Message 1: Preparation + - Write code context to ai-docs/code-review-context.md + +Message 2: Parallel Execution (3 Task calls in single message) + Task: senior-code-reviewer + Prompt: "Review ai-docs/code-review-context.md for security issues" + --- + Task: codex-code-reviewer PROXY_MODE: x-ai/grok-code-fast-1 + Prompt: "Review ai-docs/code-review-context.md for security issues" + --- + Task: 
codex-code-reviewer PROXY_MODE: google/gemini-2.5-flash + Prompt: "Review ai-docs/code-review-context.md for security issues" + + All 3 execute simultaneously (3x faster than sequential) + +Message 3: Auto-Consolidation + Task: senior-code-reviewer + Prompt: "Consolidate 3 reviews from: + - ai-docs/claude-review.md + - ai-docs/grok-review.md + - ai-docs/gemini-review.md + Prioritize by consensus." + +Message 4: Present Results + "Review complete. 3 models analyzed your code. + Top 5 issues by consensus: + 1. [UNANIMOUS] Missing input validation on login endpoint + 2. [STRONG] SQL injection risk in user query + 3. [MAJORITY] Weak password requirements + See ai-docs/consolidated-review.md for details." +``` + +**Result:** 5 minutes total (vs 15+ if sequential), consensus-based prioritization + +--- + +### Example 2: Sequential Multi-Phase Implementation + +**Scenario:** User requests "Implement payment integration feature" + +**Agent Selection:** +- Task type: API implementation +- Agents: api-architect → backend-developer → test-architect → senior-code-reviewer + +**Execution:** + +``` +Phase 1: Architecture Planning + Write: ai-docs/payment-requirements.md + "Integrate Stripe payment processing with webhook support..." + + Task: api-architect + Prompt: "Read ai-docs/payment-requirements.md + Create architecture plan" + Output: ai-docs/payment-architecture.md + Return: "Architecture plan complete. Designed 3-layer payment system." + + Wait for completion ✓ + +Phase 2: Implementation (depends on Phase 1) + Task: backend-developer + Prompt: "Read ai-docs/payment-architecture.md + Implement payment integration" + Output: src/payment.ts, src/webhooks.ts + Return: "Payment integration implemented. 2 new files, 500 lines." + + Wait for completion ✓ + +Phase 3: Testing (depends on Phase 2) + Task: test-architect + Prompt: "Write tests for src/payment.ts and src/webhooks.ts" + Output: tests/payment.test.ts, tests/webhooks.test.ts + Return: "Test suite complete. 20 tests covering payment flows." + + Wait for completion ✓ + +Phase 4: Code Review (depends on Phase 3) + Task: senior-code-reviewer + Prompt: "Review payment integration implementation" + Output: ai-docs/payment-review.md + Return: "Review complete. 2 MEDIUM issues found." + + Wait for completion ✓ +``` + +**Result:** Sequential execution ensures each phase has correct inputs + +--- + +### Example 3: Adaptive Agent Switching + +**Scenario:** User requests "Validate navbar implementation" with optional external AI + +**Agent Selection:** +- Task type: UI validation +- Base agent: designer +- Optional: designer-codex (if user wants external validation) + +**Execution:** + +``` +Step 1: Ask user preference + "Do you want external AI validation? (Yes/No)" + +Step 2a: If user says NO (speed mode) + Task: designer + Prompt: "Validate navbar against Figma design" + Output: ai-docs/design-review.md + Return: "Design validation complete. PASS with 2 minor suggestions." + +Step 2b: If user says YES (quality mode) + Message 1: Parallel Validation + Task: designer + Prompt: "Validate navbar against Figma design" + --- + Task: designer PROXY_MODE: design-review-codex + Prompt: "Validate navbar against Figma design" + + Message 2: Consolidate + Task: designer + Prompt: "Consolidate 2 design reviews. Prioritize by consensus." + Output: ai-docs/design-review-consolidated.md + Return: "Consolidated review complete. Both agree on 1 CRITICAL issue." 
+ +Step 3: User validation + Present consolidated review to user for approval +``` + +**Result:** Adaptive workflow based on user preference (speed vs quality) + +--- + +## Troubleshooting + +**Problem: Parallel tasks executing sequentially** + +Cause: Mixed tool types in same message + +Solution: Use 4-Message Pattern with ONLY Task calls in Message 2 + +``` +❌ Wrong: + await TodoWrite({...}); + await Task({...}); + await Task({...}); + +✅ Correct: + Message 1: await Bash({...}); (prep only) + Message 2: await Task({...}); await Task({...}); (parallel) +``` + +--- + +**Problem: Orchestrator context overflowing** + +Cause: Inline instructions or full output returns + +Solution: Use file-based delegation + brief summaries + +``` +❌ Wrong: + Task: agent + Prompt: "[1000 lines of inline requirements]" + Return: "[500 lines of full output]" + +✅ Correct: + Write: ai-docs/requirements.md + Task: agent + Prompt: "Read ai-docs/requirements.md" + Return: "Complete. See ai-docs/output.md" +``` + +--- + +**Problem: Wrong agent selected for task** + +Cause: Task type detection failed + +Solution: Explicitly detect task type using keywords + +``` +Check user request for keywords: + - API/endpoint/backend → api-architect, backend-developer + - UI/component/design → designer, ui-developer + - test/coverage → test-architect + - review/validate → senior-code-reviewer + +Default: Ask user to clarify task type +``` + +--- + +**Problem: Agent returns immediately before external model completes** + +Cause: Background execution (non-blocking claudish call) + +Solution: Use synchronous (blocking) execution + +``` +❌ Wrong: + claudish --model grok ... & (background, returns immediately) + +✅ Correct: + RESULT=$(claudish --model grok ...) (blocks until complete) +``` + +--- + +## Summary + +Multi-agent coordination is about choosing the right execution strategy: + +- **Parallel** when tasks are independent (3-5x speedup) +- **Sequential** when tasks have dependencies (correct results) +- **File-based delegation** to avoid context pollution (50-80% savings) +- **Brief summaries** from sub-agents (clean orchestrator context) +- **Task type detection** for intelligent agent selection +- **Context decomposition** for large tasks (avoid overflow) + +Master these patterns and you can orchestrate workflows of any complexity. + +--- + +**Extracted From:** +- `/implement` command (task detection, sequential workflows) +- `/validate-ui` command (adaptive agent switching) +- `/review` command (parallel execution, 4-Message Pattern) +- `CLAUDE.md` Parallel Multi-Model Execution Protocol diff --git a/skills/multi-model-validation/SKILL.md b/skills/multi-model-validation/SKILL.md new file mode 100644 index 0000000..a8c8392 --- /dev/null +++ b/skills/multi-model-validation/SKILL.md @@ -0,0 +1,1005 @@ +--- +name: multi-model-validation +description: Run multiple AI models in parallel for 3-5x speedup. Use when validating with Grok, Gemini, GPT-5, DeepSeek, or Claudish proxy for code review, consensus analysis, or multi-expert validation. Trigger keywords - "grok", "gemini", "gpt-5", "deepseek", "claudish", "multiple models", "parallel review", "external AI", "consensus", "multi-model". 
+version: 0.1.0 +tags: [orchestration, claudish, parallel, consensus, multi-model, grok, gemini, external-ai] +keywords: [grok, gemini, gpt-5, deepseek, claudish, parallel, consensus, multi-model, external-ai, proxy, openrouter] +--- + +# Multi-Model Validation + +**Version:** 1.0.0 +**Purpose:** Patterns for running multiple AI models in parallel via Claudish proxy +**Status:** Production Ready + +## Overview + +Multi-model validation is the practice of running multiple AI models (Grok, Gemini, GPT-5, DeepSeek, etc.) in parallel to validate code, designs, or implementations from different perspectives. This achieves: + +- **3-5x speedup** via parallel execution (15 minutes → 5 minutes) +- **Consensus-based prioritization** (issues flagged by all models are CRITICAL) +- **Diverse perspectives** (different models catch different issues) +- **Cost transparency** (know before you spend) + +The key innovation is the **4-Message Pattern**, which ensures true parallel execution by using only Task tool calls in a single message, avoiding the sequential execution trap caused by mixing tool types. + +This skill is extracted from the `/review` command and generalized for use in any multi-model workflow. + +## Core Patterns + +### Pattern 1: The 4-Message Pattern (MANDATORY) + +This pattern is **CRITICAL** for achieving true parallel execution with multiple AI models. + +**Why This Pattern Exists:** + +Claude Code executes tools **sequentially by default** when different tool types are mixed in the same message. To achieve true parallelism, you MUST: +1. Use ONLY one tool type per message +2. Ensure all Task calls are in a single message +3. Separate preparation (Bash) from execution (Task) from presentation + +**The Pattern:** + +``` +Message 1: Preparation (Bash Only) + - Create workspace directories + - Validate inputs (check if claudish installed) + - Write context files (code to review, design reference, etc.) + - NO Task calls + - NO TodoWrite calls + +Message 2: Parallel Execution (Task Only) + - Launch ALL AI models in SINGLE message + - ONLY Task tool calls + - Separate each Task with --- delimiter + - Each Task is independent (no dependencies) + - All execute simultaneously + +Message 3: Auto-Consolidation (Task Only) + - Automatically triggered when N ≥ 2 models complete + - Launch consolidation agent + - Pass all review file paths + - Apply consensus analysis + +Message 4: Present Results + - Show user prioritized issues + - Include consensus levels (unanimous, strong, majority) + - Link to detailed reports + - Cost summary (if applicable) +``` + +**Example: 5-Model Parallel Code Review** + +``` +Message 1: Preparation + Bash: mkdir -p ai-docs/reviews + Bash: git diff > ai-docs/code-review-context.md + Bash: which claudish (check if installed) + +Message 2: Parallel Execution + Task: senior-code-reviewer + Prompt: "Review ai-docs/code-review-context.md for security issues. + Write detailed review to ai-docs/reviews/claude-review.md + Return only brief summary." + --- + Task: codex-code-reviewer PROXY_MODE: x-ai/grok-code-fast-1 + Prompt: "Review ai-docs/code-review-context.md for security issues. + Write detailed review to ai-docs/reviews/grok-review.md + Return only brief summary." + --- + Task: codex-code-reviewer PROXY_MODE: google/gemini-2.5-flash + Prompt: "Review ai-docs/code-review-context.md for security issues. + Write detailed review to ai-docs/reviews/gemini-review.md + Return only brief summary." 
+ --- + Task: codex-code-reviewer PROXY_MODE: openai/gpt-5-codex + Prompt: "Review ai-docs/code-review-context.md for security issues. + Write detailed review to ai-docs/reviews/gpt5-review.md + Return only brief summary." + --- + Task: codex-code-reviewer PROXY_MODE: deepseek/deepseek-coder + Prompt: "Review ai-docs/code-review-context.md for security issues. + Write detailed review to ai-docs/reviews/deepseek-review.md + Return only brief summary." + + All 5 models execute simultaneously (5x parallelism!) + +Message 3: Auto-Consolidation + (Automatically triggered - don't wait for user to request) + + Task: senior-code-reviewer + Prompt: "Consolidate 5 code reviews from: + - ai-docs/reviews/claude-review.md + - ai-docs/reviews/grok-review.md + - ai-docs/reviews/gemini-review.md + - ai-docs/reviews/gpt5-review.md + - ai-docs/reviews/deepseek-review.md + + Apply consensus analysis: + - Issues flagged by ALL 5 → UNANIMOUS (VERY HIGH confidence) + - Issues flagged by 4 → STRONG (HIGH confidence) + - Issues flagged by 3 → MAJORITY (MEDIUM confidence) + - Issues flagged by 1-2 → DIVERGENT (LOW confidence) + + Prioritize by consensus level and severity. + Write to ai-docs/consolidated-review.md" + +Message 4: Present Results + "Multi-model code review complete! 5 AI models analyzed your code. + + Top 5 Issues (Prioritized by Consensus): + 1. [UNANIMOUS] Missing input validation on POST /api/users + 2. [UNANIMOUS] SQL injection risk in search endpoint + 3. [STRONG] Weak password hashing (bcrypt rounds too low) + 4. [MAJORITY] Missing rate limiting on authentication endpoints + 5. [MAJORITY] Insufficient error handling in payment flow + + See ai-docs/consolidated-review.md for complete analysis." +``` + +**Performance Impact:** + +- Sequential execution: 5 models × 3 min = 15 minutes +- Parallel execution: max(model times) ≈ 5 minutes +- **Speedup: 3x with perfect parallelism** + +--- + +### Pattern 2: Parallel Execution Architecture + +**Single Message, Multiple Tasks:** + +The key to parallel execution is putting ALL Task calls in a **single message** with the `---` delimiter: + +``` +✅ CORRECT - Parallel Execution: + +Task: agent1 + Prompt: "Task 1 instructions" +--- +Task: agent2 + Prompt: "Task 2 instructions" +--- +Task: agent3 + Prompt: "Task 3 instructions" + +All 3 execute simultaneously. +``` + +**Anti-Pattern: Sequential Execution** + +``` +❌ WRONG - Sequential Execution: + +Message 1: + Task: agent1 + +Message 2: + Task: agent2 + +Message 3: + Task: agent3 + +Each task waits for previous to complete (3x slower). +``` + +**Independent Tasks Requirement:** + +Each Task must be **independent** (no dependencies): + +``` +✅ CORRECT - Independent: + Task: review code for security + Task: review code for performance + Task: review code for style + + All can run simultaneously (same input, different perspectives). + +❌ WRONG - Dependent: + Task: implement feature + Task: write tests for feature (depends on implementation) + Task: review implementation (depends on tests) + + Must run sequentially (each needs previous output). +``` + +**Unique Output Files:** + +Each Task MUST write to a **unique output file** to avoid conflicts: + +``` +✅ CORRECT - Unique Files: + Task: reviewer1 → ai-docs/reviews/claude-review.md + Task: reviewer2 → ai-docs/reviews/grok-review.md + Task: reviewer3 → ai-docs/reviews/gemini-review.md + +❌ WRONG - Shared File: + Task: reviewer1 → ai-docs/review.md + Task: reviewer2 → ai-docs/review.md (overwrites reviewer1!) 
+ Task: reviewer3 → ai-docs/review.md (overwrites reviewer2!) +``` + +**Wait for All Before Consolidation:** + +Do NOT consolidate until ALL tasks complete: + +``` +✅ CORRECT - Wait for All: + Launch: Task1, Task2, Task3, Task4 (parallel) + Wait: All 4 complete + Check: results.filter(r => r.status === 'fulfilled').length + If >= 2: Proceed with consolidation + If < 2: Offer retry or abort + +❌ WRONG - Premature Consolidation: + Launch: Task1, Task2, Task3, Task4 + After 30s: Task1, Task2 done + Consolidate: Only Task1 + Task2 (Task3, Task4 still running!) +``` + +--- + +### Pattern 3: Proxy Mode Implementation + +**PROXY_MODE Directive:** + +External AI models are invoked via the PROXY_MODE directive in agent prompts: + +``` +Task: codex-code-reviewer PROXY_MODE: x-ai/grok-code-fast-1 + Prompt: "Review code for security issues..." +``` + +**Agent Behavior:** + +When an agent sees PROXY_MODE, it: + +``` +1. Detects PROXY_MODE directive in incoming prompt +2. Extracts model name (e.g., "x-ai/grok-code-fast-1") +3. Extracts actual task (everything after PROXY_MODE line) +4. Constructs claudish command: + printf '%s' "AGENT_PROMPT" | claudish --model x-ai/grok-code-fast-1 --stdin --quiet --auto-approve +5. Executes SYNCHRONOUSLY (blocking, waits for full response) +6. Captures full output +7. Writes detailed results to file (ai-docs/grok-review.md) +8. Returns BRIEF summary only (2-5 sentences) +``` + +**Critical: Blocking Execution** + +External model calls MUST be **synchronous (blocking)** so the agent waits for completion: + +``` +✅ CORRECT - Blocking (Synchronous): + RESULT=$(printf '%s' "$PROMPT" | claudish --model grok --stdin --quiet --auto-approve) + echo "$RESULT" > ai-docs/grok-review.md + echo "Grok review complete. See ai-docs/grok-review.md" + +❌ WRONG - Background (Asynchronous): + printf '%s' "$PROMPT" | claudish --model grok --stdin --quiet --auto-approve & + echo "Grok review started..." # Agent returns immediately, review not done! +``` + +**Why Blocking Matters:** + +If agents return before external models complete, the orchestrator will: +- Think all reviews are done (they're not) +- Try to consolidate partial results (missing data) +- Present incomplete results to user (bad experience) + +**Output Strategy:** + +Agents write **full detailed output to file** and return **brief summary only**: + +``` +Full Output (ai-docs/grok-review.md): + "# Code Review by Grok + + ## Security Issues + + ### CRITICAL: SQL Injection in User Search + The search endpoint constructs SQL queries using string concatenation... + [500 more lines of detailed analysis]" + +Brief Summary (returned to orchestrator): + "Grok review complete. Found 3 CRITICAL, 5 HIGH, 12 MEDIUM issues. + See ai-docs/grok-review.md for details." +``` + +**Why Brief Summaries:** + +- Orchestrator doesn't need full 500-line review in context +- Full review is in file for consolidation agent +- Keeps orchestrator context clean (context efficiency) + +**Auto-Approve Flag:** + +Use `--auto-approve` flag to prevent interactive prompts: + +``` +✅ CORRECT - Auto-Approve: + claudish --model grok --stdin --quiet --auto-approve + +❌ WRONG - Interactive (blocks waiting for user input): + claudish --model grok --stdin --quiet + # Waits for user to approve costs... but this is inside an agent! 
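+```
+
+As a minimal sketch, the blocking call looks like this in TypeScript (assuming Node's `child_process`; `runProxyReview` is a hypothetical helper, and the claudish flags are the ones documented above):
+
+```
+import { execFileSync } from "node:child_process";
+import { writeFileSync } from "node:fs";
+
+// Blocking proxy call: execFileSync does not return until claudish has
+// produced the full response (no trailing "&", no background job).
+function runProxyReview(model: string, prompt: string, outFile: string): string {
+  const output = execFileSync(
+    "claudish",
+    ["--model", model, "--stdin", "--quiet", "--auto-approve"],
+    { input: prompt, encoding: "utf8" } // prompt is piped via stdin
+  );
+  writeFileSync(outFile, output);           // full detail → file
+  return `Review complete. See ${outFile}`; // brief summary → orchestrator
+}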
+``` + +--- + +### Pattern 4: Cost Estimation and Transparency + +**Input/Output Token Separation:** + +Provide separate estimates for input and output tokens: + +``` +Cost Estimation for Multi-Model Review: + +Input Tokens (per model): + - Code context: 500 lines × 1.5 = 750 tokens + - Review instructions: 200 tokens + - Total input per model: ~1000 tokens + - Total input (5 models): 5,000 tokens + +Output Tokens (per model): + - Expected output: 2,000 - 4,000 tokens + - Total output (5 models): 10,000 - 20,000 tokens + +Cost Calculation (example rates): + - Input: 5,000 tokens × $0.0001/1k = $0.0005 + - Output: 15,000 tokens × $0.0005/1k = $0.0075 (3-5x more expensive) + - Total: $0.0080 (range: $0.0055 - $0.0105) + +User Approval Gate: + "Multi-model review will cost approximately $0.008 ($0.005 - $0.010). + Proceed? (Yes/No)" +``` + +**Input Token Estimation Formula:** + +``` +Input Tokens = (Code Lines × 1.5) + Instruction Tokens + +Why 1.5x multiplier? + - Code lines: ~1 token per line (average) + - Context overhead: +50% (imports, comments, whitespace) + +Example: + 500 lines of code → 500 × 1.5 = 750 tokens + + 200 instruction tokens = 950 tokens total input +``` + +**Output Token Estimation Formula:** + +``` +Output Tokens = Base Estimate + Complexity Factor + +Base Estimates by Task Type: + - Code review: 2,000 - 4,000 tokens + - Design validation: 1,000 - 2,000 tokens + - Architecture planning: 3,000 - 6,000 tokens + - Bug investigation: 2,000 - 5,000 tokens + +Complexity Factors: + - Simple (< 100 lines code): Use low end of range + - Medium (100-500 lines): Use mid-range + - Complex (> 500 lines): Use high end of range + +Example: + 400 lines of complex code → 4,000 tokens (high complexity) + 50 lines of simple code → 2,000 tokens (low complexity) +``` + +**Range-Based Estimates:** + +Always provide a **range** (min-max), not a single number: + +``` +✅ CORRECT - Range: + "Estimated cost: $0.005 - $0.010 (depends on review depth)" + +❌ WRONG - Single Number: + "Estimated cost: $0.0075" + (User surprised when actual is $0.0095) +``` + +**Why Output Costs More:** + +Output tokens are typically **3-5x more expensive** than input tokens: + +``` +Example Pricing (OpenRouter): + - Grok: $0.50 / 1M input, $1.50 / 1M output (3x difference) + - Gemini Flash: $0.10 / 1M input, $0.40 / 1M output (4x difference) + - GPT-5 Codex: $1.00 / 1M input, $5.00 / 1M output (5x difference) + +Impact: + If input = 5,000 tokens, output = 15,000 tokens: + Input cost: $0.0005 + Output cost: $0.0075 (15x higher despite only 3x more tokens) + Total: $0.0080 (94% is output!) +``` + +**User Approval Before Execution:** + +ALWAYS ask for user approval before expensive operations: + +``` +Present to user: + "You selected 5 AI models for code review: + - Claude Sonnet (embedded, free) + - Grok Code Fast (external, $0.002) + - Gemini 2.5 Flash (external, $0.001) + - GPT-5 Codex (external, $0.004) + - DeepSeek Coder (external, $0.001) + + Estimated total cost: $0.008 ($0.005 - $0.010) + + Proceed with multi-model review? (Yes/No)" + +If user says NO: + Offer alternatives: + 1. Use only free embedded Claude + 2. Select fewer models + 3. 
Cancel review + +If user says YES: + Proceed with Message 2 (parallel execution) +``` + +--- + +### Pattern 5: Auto-Consolidation Logic + +**Automatic Trigger:** + +Consolidation should happen **automatically** when N ≥ 2 reviews complete: + +``` +✅ CORRECT - Auto-Trigger: + +const results = await Promise.allSettled([task1, task2, task3, task4, task5]); +const successful = results.filter(r => r.status === 'fulfilled'); + +if (successful.length >= 2) { + // Auto-trigger consolidation (DON'T wait for user to ask) + const consolidated = await Task({ + subagent_type: "senior-code-reviewer", + description: "Consolidate reviews", + prompt: `Consolidate ${successful.length} reviews and apply consensus analysis` + }); + + return formatResults(consolidated); +} else { + // Too few successful reviews + notifyUser("Only 1 model succeeded. Retry failures or abort?"); +} + +❌ WRONG - Wait for User: + +const results = await Promise.allSettled([...]); +const successful = results.filter(r => r.status === 'fulfilled'); + +// Present results to user +notifyUser("3 reviews complete. Would you like me to consolidate them?"); +// Waits for user to request consolidation... +``` + +**Why Auto-Trigger:** + +- Better UX (no extra user prompt needed) +- Faster workflow (no wait for user response) +- Expected behavior (user assumes consolidation is part of workflow) + +**Minimum Threshold:** + +Require **at least 2 successful reviews** for meaningful consensus: + +``` +if (successful.length >= 2) { + // Proceed with consolidation +} else if (successful.length === 1) { + // Only 1 review succeeded + notifyUser("Only 1 model succeeded. No consensus available. See single review or retry?"); +} else { + // All failed + notifyUser("All models failed. Check logs and retry?"); +} +``` + +**Pass All Review File Paths:** + +Consolidation agent needs paths to ALL review files: + +``` +Task: senior-code-reviewer + Prompt: "Consolidate reviews from these files: + - ai-docs/reviews/claude-review.md + - ai-docs/reviews/grok-review.md + - ai-docs/reviews/gemini-review.md + + Apply consensus analysis and prioritize issues." 
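+```
+
+A small sketch of building that prompt from paths alone (`consolidationPrompt` is a hypothetical helper; the point is that only file paths, never review bodies, go into the prompt):
+
+```
+// Build the consolidation prompt from file paths only — the review
+// bodies stay on disk for the consolidation agent to read itself.
+function consolidationPrompt(reviewFiles: string[]): string {
+  const list = reviewFiles.map((f) => ` - ${f}`).join("\n");
+  return `Consolidate reviews from these files:\n${list}\n\nApply consensus analysis and prioritize issues.`;
+}
+
+const prompt = consolidationPrompt([
+  "ai-docs/reviews/claude-review.md",
+  "ai-docs/reviews/grok-review.md",
+  "ai-docs/reviews/gemini-review.md",
+]);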
+```
+
+**Don't Inline Full Reviews:**
+
+```
+❌ WRONG - Inline Reviews (context pollution):
+  Prompt: "Consolidate these reviews:
+
+  Claude Review:
+  [500 lines of review content]
+
+  Grok Review:
+  [500 lines of review content]
+
+  Gemini Review:
+  [500 lines of review content]"
+
+✅ CORRECT - File Paths:
+  Prompt: "Read and consolidate reviews from:
+  - ai-docs/reviews/claude-review.md
+  - ai-docs/reviews/grok-review.md
+  - ai-docs/reviews/gemini-review.md"
+```
+
+---
+
+### Pattern 6: Consensus Analysis
+
+**Consensus Levels:**
+
+Classify issues by how many models flagged them:
+
+```
+Consensus Levels (for N models):
+
+UNANIMOUS (100% agreement):
+  - All N models flagged this issue
+  - VERY HIGH confidence
+  - MUST FIX priority
+
+STRONG CONSENSUS (67-99% agreement):
+  - Most models flagged this issue (⌈2N/3⌉ to N-1)
+  - HIGH confidence
+  - RECOMMENDED priority
+
+MAJORITY (50-66% agreement):
+  - Half or more models flagged this issue (⌈N/2⌉ to ⌈2N/3⌉-1)
+  - MEDIUM confidence
+  - CONSIDER priority
+
+DIVERGENT (< 50% agreement):
+  - Only 1-2 models flagged this issue
+  - LOW confidence
+  - OPTIONAL priority (may be model-specific perspective)
+```
+
+**Example: 5 Models**
+
+```
+Issue Flagged By:    Consensus Level:    Priority:
+─────────────────────────────────────────────────
+All 5 models         UNANIMOUS (100%)    MUST FIX
+4 models             STRONG (80%)        RECOMMENDED
+3 models             MAJORITY (60%)      CONSIDER
+2 models             DIVERGENT (40%)     OPTIONAL
+1 model              DIVERGENT (20%)     OPTIONAL
+```
+
+**Keyword-Based Matching (v1.0):**
+
+Simple consensus analysis using keyword matching:
+
+```
+Algorithm:
+
+1. Extract issues from each review
+2. For each unique issue:
+   a. Identify keywords (e.g., "SQL injection", "input validation")
+   b. Check which other reviews mention same keywords
+   c. Count models that flagged this issue
+   d. Assign consensus level
+
+Example:
+
+Claude Review: "Missing input validation on POST /api/users"
+Grok Review: "Input validation absent in user creation endpoint"
+Gemini Review: "No validation for user POST endpoint"
+
+Keywords: ["input validation", "POST", "/api/users", "user"]
+Match: All 3 reviews mention these keywords
+Consensus: UNANIMOUS (3/3 = 100%)
+```
+
+**Model Agreement Matrix:**
+
+Show which models agree on which issues:
+
+```
+Issue Matrix:
+
+Issue                        Claude  Grok  Gemini  GPT-5  DeepSeek  Consensus
+─────────────────────────────────────────────────────────────────────────────
+SQL injection in search        ✓      ✓     ✓       ✓      ✓       UNANIMOUS
+Missing input validation       ✓      ✓     ✓       ✓      ✗       STRONG
+Weak password hashing          ✓      ✓     ✓       ✗      ✗       MAJORITY
+Missing rate limiting          ✓      ✓     ✗       ✗      ✗       DIVERGENT
+Insufficient error handling    ✓      ✗     ✗       ✗      ✗       DIVERGENT
+```
+
+**Prioritized Issue List:**
+
+Sort issues by consensus level, then by severity (counts here match the matrix above):
+
+```
+Top 10 Issues (Prioritized):
+
+1. [UNANIMOUS - CRITICAL] SQL injection in search endpoint
+   Flagged by: Claude, Grok, Gemini, GPT-5, DeepSeek (5/5)
+
+2. [STRONG - HIGH] Missing input validation on POST /api/users
+   Flagged by: Claude, Grok, Gemini, GPT-5 (4/5)
+
+3. [MAJORITY - HIGH] Weak password hashing (bcrypt rounds too low)
+   Flagged by: Claude, Grok, Gemini (3/5)
+
+4. [DIVERGENT - MEDIUM] Missing rate limiting on auth endpoints
+   Flagged by: Claude, Grok (2/5)
+
+5. [DIVERGENT - MEDIUM] Insufficient error handling in payment flow
+   Flagged by: Claude (1/5)
+
+... 
(remaining issues) +``` + +**Future Enhancement (v1.1+): Semantic Similarity** + +``` +Instead of keyword matching, use semantic similarity: + - Embed issue descriptions with sentence-transformers + - Calculate cosine similarity between embeddings + - Issues with >0.8 similarity are "same issue" + - More accurate consensus detection +``` + +--- + +## Integration with Other Skills + +**multi-model-validation + quality-gates:** + +``` +Use Case: Cost approval before expensive multi-model review + +Step 1: Cost Estimation (multi-model-validation) + Calculate input/output tokens + Estimate cost range + +Step 2: User Approval Gate (quality-gates) + Present cost estimate + Ask user for approval + If NO: Offer alternatives or abort + If YES: Proceed with execution + +Step 3: Parallel Execution (multi-model-validation) + Follow 4-Message Pattern + Launch all models simultaneously +``` + +**multi-model-validation + error-recovery:** + +``` +Use Case: Handling external model failures gracefully + +Step 1: Parallel Execution (multi-model-validation) + Launch 5 external models + +Step 2: Error Handling (error-recovery) + Model 1: Success + Model 2: Timeout after 30s → Skip, continue with others + Model 3: API 500 error → Retry once, then skip + Model 4: Success + Model 5: Success + +Step 3: Partial Success Strategy (error-recovery) + 3/5 models succeeded (≥ 2 threshold) + Proceed with consolidation using 3 reviews + Notify user: "2 models failed, proceeding with 3 reviews" + +Step 4: Consolidation (multi-model-validation) + Consolidate 3 successful reviews + Apply consensus analysis +``` + +**multi-model-validation + todowrite-orchestration:** + +``` +Use Case: Real-time progress tracking during parallel execution + +Step 1: Initialize TodoWrite (todowrite-orchestration) + Tasks: + 1. Prepare workspace + 2. Launch Claude review + 3. Launch Grok review + 4. Launch Gemini review + 5. Launch GPT-5 review + 6. Consolidate reviews + 7. Present results + +Step 2: Update Progress (todowrite-orchestration) + Mark tasks complete as models finish: + - Claude completes → Mark task 2 complete + - Grok completes → Mark task 3 complete + - Gemini completes → Mark task 4 complete + - GPT-5 completes → Mark task 5 complete + +Step 3: User Sees Real-Time Progress + "3/4 external models completed, 1 in progress..." 
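+```
+
+A sketch of that progress pattern, assuming a hypothetical `markCompleted` wrapper around TodoWrite:
+
+```
+declare function markCompleted(task: string): void; // hypothetical TodoWrite wrapper
+
+// Mark each model's todo complete the moment it settles, so the user
+// sees "2/4 external models completed" style updates in real time.
+async function runWithProgress(reviews: Record<string, Promise<unknown>>): Promise<void> {
+  const entries = Object.entries(reviews);
+  let done = 0;
+  await Promise.allSettled(
+    entries.map(([name, review]) =>
+      review.finally(() => {
+        done += 1;
+        markCompleted(`Launch ${name} review`);
+        console.log(`${done}/${entries.length} external models completed`);
+      })
+    )
+  );
+}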
+```
+
+---
+
+## Best Practices
+
+**Do:**
+- ✅ Use 4-Message Pattern for true parallel execution
+- ✅ Provide cost estimates BEFORE execution
+- ✅ Ask user approval for costs >$0.01
+- ✅ Auto-trigger consolidation when N ≥ 2 reviews complete
+- ✅ Use blocking (synchronous) claudish execution
+- ✅ Write full output to files, return brief summaries
+- ✅ Prioritize by consensus level (unanimous → strong → majority → divergent)
+- ✅ Show model agreement matrix
+- ✅ Handle partial success gracefully (some models fail)
+
+**Don't:**
+- ❌ Mix tool types in Message 2 (breaks parallelism)
+- ❌ Use background claudish execution (returns before completion)
+- ❌ Wait for user to request consolidation (auto-trigger instead)
+- ❌ Consolidate with < 2 successful reviews (no meaningful consensus)
+- ❌ Inline full reviews in consolidation prompt (use file paths)
+- ❌ Return full 500-line reviews to orchestrator (use brief summaries)
+- ❌ Skip cost approval gate for expensive operations
+
+**Performance:**
+- Parallel execution: 3-5x faster than sequential
+- Message 2 speedup: 15 min → 5 min with 5 models
+- Context efficiency: Brief summaries save 50-80% context
+
+---
+
+## Examples
+
+### Example 1: 3-Model Code Review with Cost Approval
+
+**Scenario:** User requests "Review my changes with Grok and Gemini"
+
+**Execution:** (the cost approval gate adds one extra message to the 4-Message Pattern)
+
+```
+Message 1: Preparation
+  Bash: mkdir -p ai-docs/reviews
+  Bash: git diff > ai-docs/code-review-context.md
+  Bash: wc -l ai-docs/code-review-context.md
+  Output: 450 lines
+
+  Calculate costs:
+  Input: 450 × 1.5 = 675 tokens per model
+  Output: 2000-4000 tokens per model
+  Total: 3 models × (675 input + 3000 output avg) = 11,025 tokens
+  Cost: ~$0.006 ($0.004 - $0.008)
+
+Message 2: User Approval Gate (quality-gates skill)
+  Present to user:
+  "Multi-model review will analyze 450 lines of code with 3 AI models:
+  - Claude Sonnet (embedded, free)
+  - Grok Code Fast ($0.002)
+  - Gemini 2.5 Flash ($0.001)
+
+  Estimated cost: $0.006 ($0.004 - $0.008)
+
+  Proceed? (Yes/No)"
+
+  User: "Yes"
+
+Message 3: Parallel Execution (Task only)
+  Task: senior-code-reviewer
+  Prompt: "Review ai-docs/code-review-context.md.
+  Write to ai-docs/reviews/claude-review.md
+  Return brief summary."
+  ---
+  Task: codex-code-reviewer PROXY_MODE: x-ai/grok-code-fast-1
+  Prompt: "Review ai-docs/code-review-context.md.
+  Write to ai-docs/reviews/grok-review.md
+  Return brief summary."
+  ---
+  Task: codex-code-reviewer PROXY_MODE: google/gemini-2.5-flash
+  Prompt: "Review ai-docs/code-review-context.md.
+  Write to ai-docs/reviews/gemini-review.md
+  Return brief summary."
+
+  All 3 execute simultaneously (3x speedup)
+
+Message 4: Auto-Consolidation (automatic)
+  results.length = 3 (all succeeded)
+  3 ≥ 2 ✓ (threshold met)
+
+  Task: senior-code-reviewer
+  Prompt: "Consolidate 3 reviews from:
+  - ai-docs/reviews/claude-review.md
+  - ai-docs/reviews/grok-review.md
+  - ai-docs/reviews/gemini-review.md
+
+  Apply consensus analysis."
+  Output: ai-docs/consolidated-review.md
+
+Message 5: Present Results
+  "Multi-model review complete! 3 AI models analyzed 450 lines.
+
+  Top 5 Issues (Consensus):
+  1. [UNANIMOUS] SQL injection in search endpoint
+  2. [UNANIMOUS] Missing input validation on user creation
+  3. [STRONG] Weak password hashing (2/3 models)
+  4. [STRONG] Missing rate limiting (2/3 models)
+  5. [DIVERGENT] Code style inconsistency (1/3 models)
+
+  Actual cost: $0.0058 (within estimate)
+  See ai-docs/consolidated-review.md for details." 
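+```
+
+The cost arithmetic from Message 1 can be scripted. A rough sketch using the example rates from Pattern 4 (the rates and the 1.5 tokens-per-line multiplier are illustrative, not real pricing):
+
+```
+// Range-based cost estimate for an N-model review of a given diff size.
+function estimateReviewCost(codeLines: number, models: number): { minUSD: number; maxUSD: number } {
+  const inputTokens = codeLines * 1.5 + 200;  // code + instruction tokens
+  const [outLow, outHigh] = [2000, 4000];     // code-review output range
+  const inRate = 0.0001 / 1000;               // $/input token (example rate)
+  const outRate = 0.0005 / 1000;              // $/output token (example rate)
+  return {
+    minUSD: models * (inputTokens * inRate + outLow * outRate),
+    maxUSD: models * (inputTokens * inRate + outHigh * outRate),
+  };
+}
+
+// estimateReviewCost(450, 3) → roughly $0.003 - $0.006 at these example rates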
+``` + +**Result:** 5 minutes total, consensus-based prioritization, cost transparency + +--- + +### Example 2: Partial Success with Error Recovery + +**Scenario:** 4 models selected, 2 fail + +**Execution:** + +``` +Message 1: Preparation + (same as Example 1) + +Message 2: Parallel Execution + Task: senior-code-reviewer (embedded) + Task: PROXY_MODE grok (external) + Task: PROXY_MODE gemini (external) + Task: PROXY_MODE gpt-5-codex (external) + +Message 3: Error Recovery (error-recovery skill) + results = await Promise.allSettled([...]); + + Results: + - Claude: Success ✓ + - Grok: Timeout after 30s ✗ + - Gemini: API 500 error ✗ + - GPT-5: Success ✓ + + successful.length = 2 (Claude + GPT-5) + 2 ≥ 2 ✓ (threshold met, can proceed) + + Notify user: + "2/4 models succeeded (Grok timeout, Gemini error). + Proceeding with consolidation using 2 reviews." + +Message 4: Auto-Consolidation + Task: senior-code-reviewer + Prompt: "Consolidate 2 reviews from: + - ai-docs/reviews/claude-review.md + - ai-docs/reviews/gpt5-review.md + + Note: Only 2 models (Grok and Gemini failed)." + +Message 5: Present Results + "Multi-model review complete (2/4 models succeeded). + + Top Issues (2-model consensus): + 1. [UNANIMOUS] SQL injection (both flagged) + 2. [DIVERGENT] Input validation (Claude only) + 3. [DIVERGENT] Rate limiting (GPT-5 only) + + Note: Grok and Gemini failed. Limited consensus data. + See ai-docs/consolidated-review.md for details." +``` + +**Result:** Graceful degradation, useful results despite failures + +--- + +## Troubleshooting + +**Problem: Models executing sequentially instead of parallel** + +Cause: Mixed tool types in Message 2 + +Solution: Use ONLY Task calls in Message 2 + +``` +❌ Wrong: + Message 2: + TodoWrite({...}) + Task({...}) + Task({...}) + +✅ Correct: + Message 1: TodoWrite({...}) (separate message) + Message 2: Task({...}); Task({...}) (only Task) +``` + +--- + +**Problem: Agent returns before external model completes** + +Cause: Background claudish execution + +Solution: Use synchronous (blocking) execution + +``` +❌ Wrong: + claudish --model grok ... & + +✅ Correct: + RESULT=$(claudish --model grok ...) +``` + +--- + +**Problem: Consolidation never triggers** + +Cause: Waiting for user to request it + +Solution: Auto-trigger when N ≥ 2 reviews complete + +``` +❌ Wrong: + if (results.length >= 2) { + notifyUser("Ready to consolidate. Proceed?"); + // Waits for user... + } + +✅ Correct: + if (results.length >= 2) { + // Auto-trigger, don't wait + await consolidate(); + } +``` + +--- + +**Problem: Costs higher than estimated** + +Cause: Underestimated output tokens + +Solution: Use range-based estimates, bias toward high end + +``` +✅ Better Estimation: + Output: 3,000 - 5,000 tokens (range, not single number) + Cost: $0.005 - $0.010 (gives user realistic expectation) +``` + +--- + +## Summary + +Multi-model validation achieves 3-5x speedup and consensus-based prioritization through: + +- **4-Message Pattern** (true parallel execution) +- **Blocking proxy execution** (agents wait for external models) +- **Auto-consolidation** (triggered when N ≥ 2 reviews complete) +- **Consensus analysis** (unanimous → strong → majority → divergent) +- **Cost transparency** (estimate before, report after) +- **Error recovery** (graceful degradation on failures) + +Master this skill and you can validate any implementation with multiple AI perspectives in minutes. 
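+
+The consensus bands used throughout this skill reduce to a few lines of code. A sketch, with the thresholds exactly as defined in Pattern 6:
+
+```
+type Consensus = "UNANIMOUS" | "STRONG" | "MAJORITY" | "DIVERGENT";
+
+// Map a flagged-by count to a consensus level for N models.
+function consensusLevel(flaggedBy: number, totalModels: number): Consensus {
+  const ratio = flaggedBy / totalModels;
+  if (ratio === 1) return "UNANIMOUS";   // 100%
+  if (ratio >= 2 / 3) return "STRONG";   // 67-99%
+  if (ratio >= 1 / 2) return "MAJORITY"; // 50-66%
+  return "DIVERGENT";                    // < 50%
+}
+
+// consensusLevel(4, 5) → "STRONG"; consensusLevel(3, 5) → "MAJORITY"
+```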
+ +--- + +**Extracted From:** +- `/review` command (complete multi-model review orchestration) +- `CLAUDE.md` Parallel Multi-Model Execution Protocol +- `shared/skills/claudish-usage` (proxy mode patterns) diff --git a/skills/quality-gates/SKILL.md b/skills/quality-gates/SKILL.md new file mode 100644 index 0000000..95013e4 --- /dev/null +++ b/skills/quality-gates/SKILL.md @@ -0,0 +1,996 @@ +--- +name: quality-gates +description: Implement quality gates, user approval, iteration loops, and test-driven development. Use when validating with users, implementing feedback loops, classifying issue severity, running test-driven loops, or building multi-iteration workflows. Trigger keywords - "approval", "user validation", "iteration", "feedback loop", "severity", "test-driven", "TDD", "quality gate", "consensus". +version: 0.1.0 +tags: [orchestration, quality-gates, approval, iteration, feedback, severity, test-driven, TDD] +keywords: [approval, validation, iteration, feedback-loop, severity, test-driven, TDD, quality-gate, consensus, user-approval] +--- + +# Quality Gates + +**Version:** 1.0.0 +**Purpose:** Patterns for approval gates, iteration loops, and quality validation in multi-agent workflows +**Status:** Production Ready + +## Overview + +Quality gates are checkpoints in workflows where execution pauses for validation before proceeding. They prevent low-quality work from advancing through the pipeline and ensure user expectations are met. + +This skill provides battle-tested patterns for: +- **User approval gates** (cost gates, quality gates, final acceptance) +- **Iteration loops** (automated refinement until quality threshold met) +- **Issue severity classification** (CRITICAL, HIGH, MEDIUM, LOW) +- **Multi-reviewer consensus** (unanimous vs majority agreement) +- **Feedback loops** (user reports issues → agent fixes → user validates) +- **Test-driven development loops** (write tests → run → analyze failures → fix → repeat) + +Quality gates transform "fire and forget" workflows into **iterative refinement systems** that consistently produce high-quality results. + +## Core Patterns + +### Pattern 1: User Approval Gates + +**When to Ask for Approval:** + +Use approval gates for: +- **Cost gates:** Before expensive operations (multi-model review, large-scale refactoring) +- **Quality gates:** Before proceeding to next phase (design validation before implementation) +- **Final validation:** Before completing workflow (user acceptance testing) +- **Irreversible operations:** Before destructive actions (delete files, database migrations) + +**How to Present Approval:** + +``` +Good Approval Prompt: + +"You selected 5 AI models for code review: + - Claude Sonnet (embedded, free) + - Grok Code Fast (external, $0.002) + - Gemini 2.5 Flash (external, $0.001) + - GPT-5 Codex (external, $0.004) + - DeepSeek Coder (external, $0.001) + + Estimated total cost: $0.008 ($0.005 - $0.010) + Expected duration: ~5 minutes + + Proceed with multi-model review? (Yes/No/Cancel)" + +Why it works: +✓ Clear context (what will happen) +✓ Cost transparency (range, not single number) +✓ Time expectation (5 minutes) +✓ Multiple options (Yes/No/Cancel) +``` + +**Anti-Pattern: Vague Approval** + +``` +❌ Wrong: + +"This will cost money. Proceed? (Yes/No)" + +Why it fails: +✗ No cost details (how much?) +✗ No context (what will happen?) +✗ No alternatives (what if user says no?) 
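+```
+
+A minimal sketch of such a gate, assuming a hypothetical `askUser` prompt helper (the branching follows the handling rules below):
+
+```
+declare function askUser(question: string): Promise<"yes" | "no" | "cancel">; // hypothetical
+
+// Cost gate: present context, cost range, and duration, then branch.
+async function costApprovalGate(minUSD: number, maxUSD: number, minutes: number): Promise<boolean> {
+  const answer = await askUser(
+    `Estimated cost: $${minUSD.toFixed(3)} - $${maxUSD.toFixed(3)}, duration ~${minutes} min. Proceed? (Yes/No/Cancel)`
+  );
+  if (answer === "yes") return true; // proceed with workflow
+  if (answer === "no") {
+    // offer alternatives: fewer models, free embedded model, skip step
+  }
+  return false;                      // no/cancel → caller exits gracefully
+}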
+```
+
+**Handling User Responses:**
+
+```
+User says YES:
+  → Proceed with workflow
+  → Track approval in logs
+  → Continue to next step
+
+User says NO:
+  → Offer alternatives:
+    1. Use fewer models (reduce cost)
+    2. Use only free embedded Claude
+    3. Skip this step entirely
+    4. Cancel workflow
+  → Ask user to choose alternative
+  → Proceed based on choice
+
+User says CANCEL:
+  → Gracefully exit workflow
+  → Save partial results (if any)
+  → Log cancellation reason
+  → Clean up temporary files
+  → Notify user: "Workflow cancelled. Partial results saved to..."
+```
+
+**Approval Bypasses (Advanced):**
+
+For automated workflows, allow approval bypass:
+
+```
+Automated Workflow Mode:
+
+If workflow is triggered by CI/CD or scheduled task:
+  → Skip user approval gates
+  → Use predefined defaults (e.g., max cost $0.10)
+  → Log decisions for audit trail
+  → Email report to stakeholders after completion
+
+Example:
+  if (isAutomatedMode) {
+    if (estimatedCost <= maxAutomatedCost) {
+      log(`Auto-approved: $${estimatedCost} <= $${maxAutomatedCost} threshold`);
+      proceed();
+    } else {
+      log(`Auto-rejected: $${estimatedCost} > $${maxAutomatedCost} threshold`);
+      notifyStakeholders("Cost exceeds automated threshold");
+      abort();
+    }
+  }
+```
+
+---
+
+### Pattern 2: Iteration Loop Patterns
+
+**Max Iteration Limits:**
+
+Always set a **max iteration limit** to prevent infinite loops:
+
+```
+Typical Iteration Limits:
+
+Automated quality loops: 10 iterations
+  - Designer validation → Developer fixes → Repeat
+  - Test failures → Developer fixes → Repeat
+
+User feedback loops: 5 rounds
+  - User reports issues → Developer fixes → User validates → Repeat
+
+Code review loops: 3 rounds
+  - Reviewer finds issues → Developer fixes → Re-review → Repeat
+
+Multi-model consensus: 1 iteration (no loop)
+  - Parallel review → Consolidate → Present
+```
+
+**Exit Criteria:**
+
+Define clear **exit criteria** for each loop type:
+
+```
+Loop Type: Design Validation
+
+Exit Criteria (checked after each iteration):
+  1. Designer assessment = PASS → Exit loop (success)
+  2. Iteration count >= 10 → Exit loop (max iterations)
+  3. User manually approves → Exit loop (user override)
+  4. No changes made by developer → Exit loop (stuck, escalate)
+
+Example:
+  for (let i = 1; i <= 10; i++) {
+    const review = await designer.validate();
+
+    if (review.assessment === "PASS") {
+      log("Design validation passed on iteration " + i);
+      break; // Success exit
+    }
+
+    if (i === 10) {
+      log("Max iterations reached. Escalating to user validation.");
+      break; // Max iterations exit
+    }
+
+    await developer.fix(review.issues);
+  }
+```
+
+**Progress Tracking:**
+
+Show clear progress to user during iterations:
+
+```
+Iteration Loop Progress:
+
+Iteration 1/10: Designer found 5 issues → Developer fixing...
+Iteration 2/10: Designer found 3 issues → Developer fixing...
+Iteration 3/10: Designer found 1 issue → Developer fixing...
+Iteration 4/10: Designer assessment: PASS ✓
+
+Loop completed in 4 iterations. 
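+```
+
+A sketch of the loop skeleton, assuming hypothetical `validate` and `fix` agent wrappers:
+
+```
+declare function validate(): Promise<{ pass: boolean; issues: string[] }>; // hypothetical
+declare function fix(issues: string[]): Promise<void>;                     // hypothetical
+
+// Bounded refinement loop: validate, fix, repeat; escalate at the cap.
+async function refineUntilPass(maxIterations = 10): Promise<boolean> {
+  for (let i = 1; i <= maxIterations; i++) {
+    const review = await validate();
+    console.log(`Iteration ${i}/${maxIterations}: ${review.pass ? "PASS" : review.issues.length + " issues"}`);
+    if (review.pass) return true;   // success exit
+    if (i === maxIterations) break; // cap reached
+    await fix(review.issues);
+  }
+  console.log("Max iterations reached. Escalating to user validation.");
+  return false;
+}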
+``` + +**Iteration History Documentation:** + +Track what happened in each iteration: + +``` +Iteration History (ai-docs/iteration-history.md): + +## Iteration 1 +Designer Assessment: NEEDS IMPROVEMENT +Issues Found: + - Button color doesn't match design (#3B82F6 vs #2563EB) + - Spacing between elements too tight (8px vs 16px) + - Font size incorrect (14px vs 16px) +Developer Actions: + - Updated button color to #2563EB + - Increased spacing to 16px + - Changed font size to 16px + +## Iteration 2 +Designer Assessment: NEEDS IMPROVEMENT +Issues Found: + - Border radius too large (8px vs 4px) +Developer Actions: + - Reduced border radius to 4px + +## Iteration 3 +Designer Assessment: PASS ✓ +Issues Found: None +Result: Design validation complete +``` + +--- + +### Pattern 3: Issue Severity Classification + +**Severity Levels:** + +Use 4-level severity classification: + +``` +CRITICAL - Must fix immediately + - Blocks core functionality + - Security vulnerabilities (SQL injection, XSS, auth bypass) + - Data loss risk + - System crashes + - Build failures + + Action: STOP workflow, fix immediately, re-validate + +HIGH - Should fix soon + - Major bugs (incorrect behavior) + - Performance issues (>3s page load, memory leaks) + - Accessibility violations (keyboard navigation broken) + - User experience blockers + + Action: Fix in current iteration, proceed after fix + +MEDIUM - Should fix + - Minor bugs (edge cases, visual glitches) + - Code quality issues (duplication, complexity) + - Non-blocking performance issues + - Incomplete error handling + + Action: Fix if time permits, or schedule for next iteration + +LOW - Nice to have + - Code style inconsistencies + - Minor refactoring opportunities + - Documentation improvements + - Polish and optimization + + Action: Log for future improvement, proceed without fixing +``` + +**Severity-Based Prioritization:** + +``` +Issue List (sorted by severity): + +CRITICAL Issues (must fix all before proceeding): + 1. SQL injection in user search endpoint + 2. Missing authentication check on admin routes + 3. Password stored in plaintext + +HIGH Issues (fix before code review): + 4. Memory leak in WebSocket connection + 5. Missing error handling in payment flow + 6. Accessibility: keyboard navigation broken + +MEDIUM Issues (fix if time permits): + 7. Code duplication in auth controllers + 8. Inconsistent error messages + 9. Missing JSDoc comments + +LOW Issues (defer to future): + 10. Variable naming inconsistency + 11. Redundant type annotations + 12. 
CSS could use more specificity
+
+Action Plan:
+  - Fix CRITICAL (1-3) immediately → Re-run tests
+  - Fix HIGH (4-6) before code review
+  - Log MEDIUM (7-9) for next iteration
+  - Ignore LOW (10-12) for now
+```
+
+**Severity Escalation:**
+
+Issues can escalate in severity based on context:
+
+```
+Context-Based Escalation:
+
+Issue: "Missing error handling in payment flow"
+  Base Severity: MEDIUM (code quality issue)
+
+  Context 1: Development environment
+  → Severity: MEDIUM (not user-facing yet)
+
+  Context 2: Production environment
+  → Severity: HIGH (affects real users, money involved)
+
+  Context 3: Production + recent payment failures
+  → Severity: CRITICAL (actively causing issues)
+
+Rule: Escalate severity when:
+  - Issue affects production users
+  - Issue involves money/security/data
+  - Issue is currently causing failures
+```
+
+---
+
+### Pattern 4: Multi-Reviewer Consensus
+
+**Consensus Levels:**
+
+When multiple reviewers evaluate the same code/design:
+
+```
+UNANIMOUS (100% agreement):
+  - ALL reviewers flagged this issue
+  - VERY HIGH confidence
+  - Highest priority (likely a real problem)
+
+Example:
+  3/3 reviewers: "SQL injection in search endpoint"
+  → UNANIMOUS consensus
+  → CRITICAL priority (all agree it's critical)
+
+STRONG CONSENSUS (67-99% agreement):
+  - MOST reviewers flagged this issue
+  - HIGH confidence
+  - High priority (probably a real problem)
+
+Example:
+  2/3 reviewers: "Missing input validation"
+  → STRONG consensus (67%)
+  → HIGH priority
+
+MAJORITY (50-66% agreement):
+  - HALF or more flagged this issue
+  - MEDIUM confidence
+  - Medium priority (worth investigating)
+
+Example:
+  2/4 reviewers: "Code duplication in controllers"
+  → MAJORITY consensus (50%)
+  → MEDIUM priority
+
+DIVERGENT (< 50% agreement):
+  - Only 1-2 reviewers flagged this issue
+  - LOW confidence
+  - Low priority (may be model-specific or false positive)
+
+Example:
+  1/3 reviewers: "Variable naming could be better"
+  → DIVERGENT (33%)
+  → LOW priority (one reviewer's opinion)
+```
+
+**Consensus-Based Prioritization:**
+
+```
+Prioritized Issue List (by consensus + severity):
+
+1. [UNANIMOUS - CRITICAL] SQL injection in search
+   ALL reviewers agree: Claude, Grok, Gemini (3/3)
+
+2. [UNANIMOUS - HIGH] Missing input validation
+   ALL reviewers agree: Claude, Grok, Gemini (3/3)
+
+3. [STRONG - HIGH] Memory leak in WebSocket
+   MOST reviewers agree: Claude, Grok (2/3)
+
+4. [STRONG - MEDIUM] Code duplication
+   MOST reviewers agree: Claude, Gemini (2/3)
+
+5. [DIVERGENT - LOW] Variable naming
+   SINGLE reviewer: Claude only (1/3)
+
+Action:
+  - Fix issues 1-2 immediately (unanimous + CRITICAL/HIGH)
+  - Fix issue 3 before review (strong consensus)
+  - Consider issue 4 (strong consensus, but medium severity)
+  - Ignore issue 5 (divergent, likely false positive)
+```
+
+---
+
+### Pattern 5: Feedback Loop Implementation
+
+**User Feedback Loop:**
+
+```
+Workflow: User Validation with Feedback
+
+Step 1: Initial Implementation
+  Developer implements feature
+  Designer/Tester validates
+  Present to user for manual validation
+
+Step 2: User Validation Gate (MANDATORY)
+  Present to user:
+  "Implementation complete. Please manually verify:
+  - Open app at http://localhost:3000
+  - Test feature: [specific instructions]
+  - Compare to design reference
+
+  Does it meet expectations? 
(Yes/No)" + +Step 3a: User says YES + → ✅ Feature approved + → Generate final report + → Mark workflow complete + +Step 3b: User says NO + → Collect specific feedback + +Step 4: Collect Specific Feedback + Ask user: "Please describe the issues you found:" + + User response: + "1. Button color is wrong (should be blue, not green) + 2. Spacing is too tight between elements + 3. Font size is too small" + +Step 5: Extract Structured Feedback + Parse user feedback into structured issues: + + Issue 1: + Component: Button + Problem: Color incorrect + Expected: Blue (#2563EB) + Actual: Green (#10B981) + Severity: MEDIUM + + Issue 2: + Component: Container + Problem: Spacing too tight + Expected: 16px + Actual: 8px + Severity: MEDIUM + + Issue 3: + Component: Text + Problem: Font size too small + Expected: 16px + Actual: 14px + Severity: LOW + +Step 6: Launch Fixing Agent + Task: ui-developer + Prompt: "Fix user-reported issues: + + 1. Button color: Change from #10B981 to #2563EB + 2. Container spacing: Increase from 8px to 16px + 3. Text font size: Increase from 14px to 16px + + User feedback: [user's exact words]" + +Step 7: Re-validate + After fixes: + - Re-run designer validation + - Loop back to Step 2 (user validation) + +Step 8: Max Feedback Rounds + Limit: 5 feedback rounds (prevent infinite loop) + + If round > 5: + Escalate to human review + "Unable to meet user expectations after 5 rounds. + Manual intervention required." +``` + +**Feedback Round Tracking:** + +``` +Feedback Round History: + +Round 1: + User Issues: Button color, spacing, font size + Fixes Applied: Updated all 3 issues + Result: Re-validate + +Round 2: + User Issues: Border radius too large + Fixes Applied: Reduced border radius + Result: Re-validate + +Round 3: + User Issues: None + Result: ✅ APPROVED + +Total Rounds: 3/5 +``` + +--- + +### Pattern 6: Test-Driven Development Loop + +**When to Use:** + +Use TDD loop **after implementing code, before code review**: + +``` +Workflow Phases: + +Phase 1: Architecture Planning +Phase 2: Implementation +Phase 2.5: Test-Driven Development Loop ← THIS PATTERN +Phase 3: Code Review +Phase 4: User Acceptance +``` + +**The TDD Loop Pattern:** + +``` +Step 1: Write Tests First + Task: test-architect + Prompt: "Write comprehensive tests for authentication feature. + Requirements: [link to requirements] + Implementation: [link to code]" + Output: tests/auth.test.ts + +Step 2: Run Tests + Bash: bun test tests/auth.test.ts + Capture output and exit code + +Step 3: Check Test Results + If all tests pass: + → ✅ TDD loop complete + → Proceed to code review (Phase 3) + + If tests fail: + → Analyze failure (continue to Step 4) + +Step 4: Analyze Test Failure + Task: test-architect + Prompt: "Analyze test failure output: + + [test failure logs] + + Determine root cause: + - TEST_ISSUE: Test has bug (bad assertion, missing mock, wrong expectation) + - IMPLEMENTATION_ISSUE: Code has bug (logic error, missing validation, incorrect behavior) + + Provide detailed analysis." 
+ + test-architect returns: + verdict: TEST_ISSUE | IMPLEMENTATION_ISSUE + analysis: Detailed explanation + recommendation: Specific fix needed + +Step 5a: If TEST_ISSUE (test is wrong) + Task: test-architect + Prompt: "Fix test based on analysis: + [analysis from Step 4]" + + After fix: + → Re-run tests (back to Step 2) + → Loop continues + +Step 5b: If IMPLEMENTATION_ISSUE (code is wrong) + Provide structured feedback to developer: + + Task: backend-developer + Prompt: "Fix implementation based on test failure: + + Test Failure: + [failure output] + + Root Cause: + [analysis from test-architect] + + Recommended Fix: + [specific fix needed]" + + After fix: + → Re-run tests (back to Step 2) + → Loop continues + +Step 6: Max Iteration Limit + Limit: 10 iterations + + Iteration tracking: + Iteration 1/10: 5 tests failed → Fix implementation + Iteration 2/10: 2 tests failed → Fix test (bad mock) + Iteration 3/10: All tests pass ✅ + + If iteration > 10: + Escalate to human review + "Unable to pass all tests after 10 iterations. + Manual debugging required." +``` + +**Example TDD Loop:** + +``` +Phase 2.5: Test-Driven Development Loop + +Iteration 1: + Tests Run: 20 tests + Results: 5 failed, 15 passed + Failure: "JWT token validation fails with expired token" + Analysis: IMPLEMENTATION_ISSUE - Missing expiration check + Fix: Added expiration validation in TokenService + Re-run: Continue to Iteration 2 + +Iteration 2: + Tests Run: 20 tests + Results: 2 failed, 18 passed + Failure: "Mock database not reset between tests" + Analysis: TEST_ISSUE - Missing beforeEach cleanup + Fix: Added database reset in test setup + Re-run: Continue to Iteration 3 + +Iteration 3: + Tests Run: 20 tests + Results: All passed ✅ + Result: TDD loop complete, proceed to code review + +Total Iterations: 3/10 +Duration: ~5 minutes +Benefits: + - Caught 2 bugs before code review + - Fixed 1 test quality issue + - All tests passing gives confidence in implementation +``` + +**Benefits of TDD Loop:** + +``` +Benefits: + +1. Catch bugs early (before code review, not after) +2. Ensure test quality (test-architect fixes bad tests) +3. Automated quality assurance (no manual testing needed) +4. Fast feedback loop (seconds to run tests, not minutes) +5. 
Confidence in implementation (all tests passing) + +Performance: + Traditional: Implement → Review → Find bugs → Fix → Re-review + Time: 30+ minutes, multiple review rounds + + TDD Loop: Implement → Test → Fix → Test → Review (with confidence) + Time: 15 minutes, single review round (fewer issues) +``` + +--- + +## Integration with Other Skills + +**quality-gates + multi-model-validation:** + +``` +Use Case: Cost approval before multi-model review + +Step 1: Estimate costs (multi-model-validation) +Step 2: User approval gate (quality-gates) + If approved: Proceed with parallel execution + If rejected: Offer alternatives +Step 3: Execute review (multi-model-validation) +``` + +**quality-gates + multi-agent-coordination:** + +``` +Use Case: Iteration loop with designer validation + +Step 1: Agent selection (multi-agent-coordination) + Select designer + ui-developer + +Step 2: Iteration loop (quality-gates) + For i = 1 to 10: + - Run designer validation + - If PASS: Exit loop + - Else: Delegate to ui-developer for fixes + +Step 3: User validation gate (quality-gates) + Mandatory manual approval +``` + +**quality-gates + error-recovery:** + +``` +Use Case: Test-driven loop with error recovery + +Step 1: Run tests (quality-gates TDD pattern) +Step 2: If test execution fails (error-recovery) + - Syntax error → Fix and retry + - Framework crash → Notify user, skip TDD +Step 3: If tests pass (quality-gates) + - Proceed to code review +``` + +--- + +## Best Practices + +**Do:** +- ✅ Set max iteration limits (prevent infinite loops) +- ✅ Define clear exit criteria (PASS, max iterations, user override) +- ✅ Track iteration history (document what happened) +- ✅ Show progress to user ("Iteration 3/10 complete") +- ✅ Classify issue severity (CRITICAL → HIGH → MEDIUM → LOW) +- ✅ Prioritize by consensus + severity +- ✅ Ask user approval for expensive operations +- ✅ Collect specific feedback (not vague complaints) +- ✅ Use TDD loop to catch bugs early + +**Don't:** +- ❌ Create infinite loops (no exit criteria) +- ❌ Skip user validation gates (mandatory for UX) +- ❌ Ignore consensus (unanimous issues are real) +- ❌ Batch all severities together (prioritize CRITICAL) +- ❌ Proceed without approval for >$0.01 operations +- ❌ Collect vague feedback ("it's wrong" → what specifically?) +- ❌ Skip TDD loop (catches bugs before expensive review) + +**Performance:** +- Iteration loops: 5-10 iterations typical, max 10-15 min +- TDD loop: 3-5 iterations typical, max 5-10 min +- User feedback: 1-3 rounds typical, max 5 rounds + +--- + +## Examples + +### Example 1: User Approval Gate for Multi-Model Review + +**Scenario:** User requests multi-model review, costs $0.008 + +**Execution:** + +``` +Step 1: Estimate Costs + Input: 450 lines × 1.5 = 675 tokens per model + Output: 2000-4000 tokens per model + Total: 3 models × 3000 avg = 9000 output tokens + Cost: ~$0.008 ($0.005 - $0.010) + +Step 2: Present Approval Gate + "Multi-model review will analyze 450 lines with 3 AI models: + - Claude Sonnet (embedded, free) + - Grok Code Fast (external, $0.002) + - Gemini 2.5 Flash (external, $0.001) + + Estimated cost: $0.008 ($0.005 - $0.010) + Duration: ~5 minutes + + Proceed? (Yes/No/Cancel)" + +Step 3a: User says YES + → Proceed with parallel execution + → Track approval: log("User approved $0.008 cost") + +Step 3b: User says NO + → Offer alternatives: + 1. Use only free Claude (no external models) + 2. Use only 1 external model (reduce cost to $0.002) + 3. 
Skip review entirely + → Ask user to choose + +Step 3c: User says CANCEL + → Exit gracefully + → Log: "User cancelled multi-model review" + → Clean up temporary files +``` + +--- + +### Example 2: Designer Validation Iteration Loop + +**Scenario:** UI implementation with automated iteration until PASS + +**Execution:** + +``` +Iteration 1: + Task: designer + Prompt: "Validate navbar against Figma design" + Output: ai-docs/design-review-1.md + Assessment: NEEDS IMPROVEMENT + Issues: + - Button color: #3B82F6 (expected #2563EB) + - Spacing: 8px (expected 16px) + + Task: ui-developer + Prompt: "Fix issues from ai-docs/design-review-1.md" + Changes: Updated button color, increased spacing + + Result: Continue to Iteration 2 + +Iteration 2: + Task: designer + Prompt: "Re-validate navbar" + Output: ai-docs/design-review-2.md + Assessment: NEEDS IMPROVEMENT + Issues: + - Border radius: 8px (expected 4px) + + Task: ui-developer + Prompt: "Fix border radius issue" + Changes: Reduced border radius to 4px + + Result: Continue to Iteration 3 + +Iteration 3: + Task: designer + Prompt: "Re-validate navbar" + Output: ai-docs/design-review-3.md + Assessment: PASS ✓ + Issues: None + + Result: Exit loop (success) + +Summary: + Total Iterations: 3/10 + Duration: ~8 minutes + Automated Fixes: 3 issues resolved + Result: PASS, proceed to user validation +``` + +--- + +### Example 3: Test-Driven Development Loop + +**Scenario:** Authentication implementation with TDD + +**Execution:** + +``` +Phase 2.5: Test-Driven Development Loop + +Iteration 1: + Task: test-architect + Prompt: "Write tests for authentication feature" + Output: tests/auth.test.ts (20 tests) + + Bash: bun test tests/auth.test.ts + Result: 5 failed, 15 passed + + Task: test-architect + Prompt: "Analyze test failures" + Verdict: IMPLEMENTATION_ISSUE + Analysis: "Missing JWT expiration validation" + + Task: backend-developer + Prompt: "Add JWT expiration validation" + Changes: Updated TokenService.verify() + + Bash: bun test tests/auth.test.ts + Result: Continue to Iteration 2 + +Iteration 2: + Bash: bun test tests/auth.test.ts + Result: 2 failed, 18 passed + + Task: test-architect + Prompt: "Analyze test failures" + Verdict: TEST_ISSUE + Analysis: "Mock database not reset between tests" + + Task: test-architect + Prompt: "Fix test setup" + Changes: Added beforeEach cleanup + + Bash: bun test tests/auth.test.ts + Result: Continue to Iteration 3 + +Iteration 3: + Bash: bun test tests/auth.test.ts + Result: All 20 passed ✅ + + Result: TDD loop complete, proceed to code review + +Summary: + Total Iterations: 3/10 + Duration: ~5 minutes + Bugs Caught: 1 implementation bug, 1 test bug + Result: All tests passing, high confidence in code +``` + +--- + +## Troubleshooting + +**Problem: Infinite iteration loop** + +Cause: No exit criteria or max iteration limit + +Solution: Always set max iterations (10 for automated, 5 for user feedback) + +``` +❌ Wrong: + while (true) { + if (review.assessment === "PASS") break; + fix(); + } + +✅ Correct: + for (let i = 1; i <= 10; i++) { + if (review.assessment === "PASS") break; + if (i === 10) escalateToUser(); + fix(); + } +``` + +--- + +**Problem: User approval skipped for expensive operation** + +Cause: Missing approval gate + +Solution: Always ask approval for costs >$0.01 + +``` +❌ Wrong: + if (userRequestedMultiModel) { + executeReview(); + } + +✅ Correct: + if (userRequestedMultiModel) { + const cost = estimateCost(); + if (cost > 0.01) { + const approved = await askUserApproval(cost); + if (!approved) return 
offerAlternatives(); + } + executeReview(); + } +``` + +--- + +**Problem: All issues treated equally** + +Cause: No severity classification + +Solution: Classify by severity, prioritize CRITICAL + +``` +❌ Wrong: + issues.forEach(issue => fix(issue)); + +✅ Correct: + const critical = issues.filter(i => i.severity === "CRITICAL"); + const high = issues.filter(i => i.severity === "HIGH"); + + critical.forEach(issue => fix(issue)); // Fix critical first + high.forEach(issue => fix(issue)); // Then high + // MEDIUM and LOW deferred or skipped +``` + +--- + +## Summary + +Quality gates ensure high-quality results through: + +- **User approval gates** (cost, quality, final validation) +- **Iteration loops** (automated refinement, max 10 iterations) +- **Severity classification** (CRITICAL → HIGH → MEDIUM → LOW) +- **Consensus prioritization** (unanimous → strong → majority → divergent) +- **Feedback loops** (collect specific issues, fix, re-validate) +- **Test-driven development** (write tests, run, fix, repeat until pass) + +Master these patterns and your workflows will consistently produce high-quality, validated results. + +--- + +**Extracted From:** +- `/review` command (user approval for costs, consensus analysis) +- `/validate-ui` command (iteration loops, user validation gates, feedback collection) +- `/implement` command (PHASE 2.5 test-driven development loop) +- Multi-model review patterns (consensus-based prioritization) diff --git a/skills/todowrite-orchestration/SKILL.md b/skills/todowrite-orchestration/SKILL.md new file mode 100644 index 0000000..c583b2c --- /dev/null +++ b/skills/todowrite-orchestration/SKILL.md @@ -0,0 +1,983 @@ +--- +name: todowrite-orchestration +description: Track progress in multi-phase workflows with TodoWrite. Use when orchestrating 5+ phase commands, managing iteration loops, tracking parallel tasks, or providing real-time progress visibility. Trigger keywords - "phase tracking", "progress", "workflow", "multi-step", "multi-phase", "todo", "tracking", "status". +version: 0.1.0 +tags: [orchestration, todowrite, progress, tracking, workflow, multi-phase] +keywords: [phase-tracking, progress, workflow, multi-step, multi-phase, todo, tracking, status, visibility] +--- + +# TodoWrite Orchestration + +**Version:** 1.0.0 +**Purpose:** Patterns for using TodoWrite in complex multi-phase workflows +**Status:** Production Ready + +## Overview + +TodoWrite orchestration is the practice of using the TodoWrite tool to provide **real-time progress visibility** in complex multi-phase workflows. It transforms opaque "black box" workflows into transparent, trackable processes where users can see: + +- What phase is currently executing +- How many phases remain +- Which tasks are pending, in-progress, or completed +- Overall progress percentage +- Iteration counts in loops + +This skill provides battle-tested patterns for: +- **Phase initialization** (create complete task list before starting) +- **Task granularity** (how to break phases into trackable tasks) +- **Status transitions** (pending → in_progress → completed) +- **Real-time updates** (mark complete immediately, not batched) +- **Iteration tracking** (progress through loops) +- **Parallel task tracking** (multiple agents executing simultaneously) + +TodoWrite orchestration is especially valuable for workflows with >5 phases or >10 minutes duration, where users need progress feedback. 
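+
+As a minimal sketch, step 0 might look like this (the `TodoWrite` call shape is assumed for illustration, mirroring the pseudo-calls used in the patterns below):
+
+```
+// Assumed tool shape — illustration only.
+declare function TodoWrite(args: {
+  todos: { content: string; status: "pending" | "in_progress" | "completed" }[];
+}): Promise<void>;
+
+// Step 0: create the complete task list before any work starts.
+const todos = [
+  "PHASE 1: Gather user inputs",
+  "PHASE 2: Estimate costs and get approval",
+  "PHASE 3: Launch parallel reviews",
+  "PHASE 4: Consolidate reviews",
+  "PHASE 5: Present results",
+].map((content) => ({ content, status: "pending" as const }));
+
+await TodoWrite({ todos });
+```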
+ +## Core Patterns + +### Pattern 1: Phase Initialization + +**Create TodoWrite List BEFORE Starting:** + +Initialize TodoWrite as **step 0** of your workflow, before any actual work begins: + +``` +✅ CORRECT - Initialize First: + +Step 0: Initialize TodoWrite + TodoWrite: Create task list + - PHASE 1: Gather user inputs + - PHASE 1: Validate inputs + - PHASE 2: Select AI models + - PHASE 2: Estimate costs + - PHASE 2: Get user approval + - PHASE 3: Launch parallel reviews + - PHASE 3: Wait for all reviews + - PHASE 4: Consolidate reviews + - PHASE 5: Present results + +Step 1: Start actual work (PHASE 1) + Mark "PHASE 1: Gather user inputs" as in_progress + ... do work ... + Mark "PHASE 1: Gather user inputs" as completed + Mark "PHASE 1: Validate inputs" as in_progress + ... do work ... + +❌ WRONG - Create During Workflow: + +Step 1: Do some work + ... work happens ... + TodoWrite: Create task "Did some work" (completed) + +Step 2: Do more work + ... work happens ... + TodoWrite: Create task "Did more work" (completed) + +Problem: User has no visibility into upcoming phases +``` + +**List All Phases Upfront:** + +When initializing, include **all phases** in the task list, not just the current phase: + +``` +✅ CORRECT - Complete Visibility: + +TodoWrite Initial State: + [ ] PHASE 1: Gather user inputs + [ ] PHASE 1: Validate inputs + [ ] PHASE 2: Architecture planning + [ ] PHASE 3: Implementation + [ ] PHASE 3: Run quality checks + [ ] PHASE 4: Code review + [ ] PHASE 5: User acceptance + [ ] PHASE 6: Generate report + +User sees: "8 tasks total, 0 complete, Phase 1 starting" + +❌ WRONG - Incremental Discovery: + +TodoWrite Initial State: + [ ] PHASE 1: Gather user inputs + [ ] PHASE 1: Validate inputs + +(User thinks workflow is 2 tasks, then surprised by 6 more phases) +``` + +**Why Initialize First:** + +1. **User expectation setting:** User knows workflow scope (8 phases, ~20 minutes) +2. **Progress visibility:** User can see % complete (3/8 = 37.5%) +3. **Time estimation:** User can estimate remaining time based on progress +4. **Transparency:** No hidden phases or surprises + +--- + +### Pattern 2: Task Granularity Guidelines + +**One Task Per Significant Operation:** + +Each task should represent a **significant operation** (1-5 minutes of work): + +``` +✅ CORRECT - Significant Operations: + +Tasks: + - PHASE 1: Ask user for inputs (30s) + - PHASE 2: Generate architecture plan (2 min) + - PHASE 3: Implement feature (5 min) + - PHASE 4: Run tests (1 min) + - PHASE 5: Code review (3 min) + +Each task = meaningful unit of work + +❌ WRONG - Too Granular: + +Tasks: + - PHASE 1: Ask user question 1 + - PHASE 1: Ask user question 2 + - PHASE 1: Ask user question 3 + - PHASE 2: Read file A + - PHASE 2: Read file B + - PHASE 2: Write file C + - ... 
(50 micro-tasks) + +Problem: Too many updates, clutters user interface +``` + +**Multi-Step Phases: Break Into 2-3 Sub-Tasks:** + +For complex phases (>5 minutes), break into 2-3 sub-tasks: + +``` +✅ CORRECT - Sub-Task Breakdown: + +PHASE 3: Implementation (15 min total) + → Sub-tasks: + - PHASE 3: Implement core logic (5 min) + - PHASE 3: Add error handling (3 min) + - PHASE 3: Write tests (7 min) + +User sees progress within phase: "PHASE 3: 2/3 complete" + +❌ WRONG - Single Monolithic Task: + +PHASE 3: Implementation (15 min) + → No sub-tasks + +Problem: User sees "in_progress" for 15 min with no updates +``` + +**Avoid Too Many Tasks:** + +Limit to **max 15-20 tasks** for readability: + +``` +✅ CORRECT - 12 Tasks (readable): + +10-phase workflow: + - PHASE 1: Ask user + - PHASE 2: Plan (2 sub-tasks) + - PHASE 3: Implement (3 sub-tasks) + - PHASE 4: Test + - PHASE 5: Review (2 sub-tasks) + - PHASE 6: Fix issues + - PHASE 7: Re-review + - PHASE 8: Accept + +Total: 12 tasks (clean, trackable) + +❌ WRONG - 50 Tasks (overwhelming): + +Every single action as separate task: + - Read file 1 + - Read file 2 + - Write file 3 + - Run command 1 + - ... (50 tasks) + +Problem: User overwhelmed, can't see forest for trees +``` + +**Guideline by Workflow Duration:** + +``` +Workflow Duration → Task Count: + +< 5 minutes: 3-5 tasks +5-15 minutes: 8-12 tasks +15-30 minutes: 12-18 tasks +> 30 minutes: 15-20 tasks (if more, group into phases) + +Example: + 5-minute workflow (3 phases): + - PHASE 1: Prepare + - PHASE 2: Execute + - PHASE 3: Present + Total: 3 tasks ✓ + + 20-minute workflow (6 phases): + - PHASE 1: Ask user + - PHASE 2: Plan (2 sub-tasks) + - PHASE 3: Implement (3 sub-tasks) + - PHASE 4: Test + - PHASE 5: Review (2 sub-tasks) + - PHASE 6: Accept + Total: 11 tasks ✓ +``` + +--- + +### Pattern 3: Status Transitions + +**Exactly ONE Task In Progress at a Time:** + +Maintain the invariant: **exactly one task in_progress** at any moment: + +``` +✅ CORRECT - One In-Progress: + +State at time T1: + [✓] PHASE 1: Ask user (completed) + [✓] PHASE 2: Plan (completed) + [→] PHASE 3: Implement (in_progress) ← Only one + [ ] PHASE 4: Test (pending) + [ ] PHASE 5: Review (pending) + +State at time T2 (after PHASE 3 completes): + [✓] PHASE 1: Ask user (completed) + [✓] PHASE 2: Plan (completed) + [✓] PHASE 3: Implement (completed) + [→] PHASE 4: Test (in_progress) ← Only one + [ ] PHASE 5: Review (pending) + +❌ WRONG - Multiple In-Progress: + +State: + [✓] PHASE 1: Ask user (completed) + [→] PHASE 2: Plan (in_progress) ← Two in-progress? + [→] PHASE 3: Implement (in_progress) ← Confusing! + [ ] PHASE 4: Test (pending) + +Problem: User confused about current phase +``` + +**Status Transition Sequence:** + +``` +Lifecycle of a Task: + +1. Created: pending + (Task exists, not started yet) + +2. Started: pending → in_progress + (Mark as in_progress when starting work) + +3. Completed: in_progress → completed + (Mark as completed immediately after finishing) + +4. Next task: Mark next task as in_progress + (Continue to next task) + +Example Timeline: + +T=0s: [→] Task 1 (in_progress), [ ] Task 2 (pending) +T=30s: [✓] Task 1 (completed), [→] Task 2 (in_progress) +T=60s: [✓] Task 1 (completed), [✓] Task 2 (completed) +``` + +**NEVER Batch Completions:** + +Mark tasks completed **immediately** after finishing, not at end of phase: + +``` +✅ CORRECT - Immediate Updates: + +Mark "PHASE 1: Ask user" as in_progress +... do work (30s) ... 
**NEVER Batch Completions:**

Mark tasks completed **immediately** after finishing, not at end of phase:

```
✅ CORRECT - Immediate Updates:

Mark "PHASE 1: Ask user" as in_progress
... do work (30s) ...
Mark "PHASE 1: Ask user" as completed ← Immediate

Mark "PHASE 1: Validate inputs" as in_progress
... do work (20s) ...
Mark "PHASE 1: Validate inputs" as completed ← Immediate

User sees real-time progress

❌ WRONG - Batched Updates:

Mark "PHASE 1: Ask user" as in_progress
... do work (30s) ...

Mark "PHASE 1: Validate inputs" as in_progress
... do work (20s) ...

(At end of PHASE 1, batch update both to completed)

Problem: User doesn't see progress for 50s, thinks workflow is stuck
```

---

### Pattern 4: Real-Time Progress Tracking

**Update TodoWrite As Work Progresses:**

TodoWrite should reflect **current state**, not past state:

```
✅ CORRECT - Real-Time Updates:

T=0s: Initialize TodoWrite (8 tasks, all pending)
T=5s: Mark "PHASE 1" as in_progress
T=35s: Mark "PHASE 1" as completed, "PHASE 2" as in_progress
T=90s: Mark "PHASE 2" as completed, "PHASE 3" as in_progress
...

User always sees accurate current state

❌ WRONG - Delayed Updates:

T=0s: Initialize TodoWrite
T=300s: Workflow completes
T=301s: Update all tasks to completed

Problem: No progress visibility for 5 minutes
```

**Add New Tasks If Discovered During Execution:**

If you discover additional work during execution, add new tasks:

```
Scenario: During implementation, realize refactoring needed

Initial TodoWrite:
  [✓] PHASE 1: Plan
  [→] PHASE 2: Implement
  [ ] PHASE 3: Test
  [ ] PHASE 4: Review

During PHASE 2, discover:
  "Implementation requires refactoring legacy code"

Updated TodoWrite:
  [✓] PHASE 1: Plan
  [✓] PHASE 2: Implement core logic (completed)
  [→] PHASE 2: Refactor legacy code (in_progress) ← New task added
  [ ] PHASE 3: Test
  [ ] PHASE 4: Review

User sees: "Additional work discovered: refactoring. Total now 5 tasks."
```

**User Can See Current Progress at Any Time:**

With real-time updates, the user can check progress at any moment:

```
User checks at T=120s:

TodoWrite State:
  [✓] PHASE 1: Ask user
  [✓] PHASE 2: Plan architecture
  [→] PHASE 3: Implement core logic (in_progress)
  [ ] PHASE 3: Add error handling
  [ ] PHASE 3: Write tests
  [ ] PHASE 4: Code review
  [ ] PHASE 5: Accept

User sees: "2/7 tasks complete (29%), currently implementing core logic"
```
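The add-on-discovery step reduces to a small list operation. A minimal TypeScript sketch, reusing the hypothetical `Todo` shape from the Pattern 3 sketch and assuming the discovered task should run immediately after the one in flight:

```typescript
type Status = "pending" | "in_progress" | "completed";
interface Todo { content: string; status: Status; }

// Insert a newly discovered task right after the task in flight so the
// expanded scope becomes visible immediately (if nothing is in flight,
// the task lands at the front of the list).
function addDiscoveredTask(todos: Todo[], content: string): void {
  const i = todos.findIndex((t) => t.status === "in_progress");
  todos.splice(i + 1, 0, { content, status: "pending" });
  console.log(`Additional work discovered: ${content}. Total now ${todos.length} tasks.`);
}
```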
---

### Pattern 5: Iteration Loop Tracking

**Create Task Per Iteration:**

For iteration loops, create a task for each iteration:

```
✅ CORRECT - Iteration Tasks:

Design Validation Loop (max 10 iterations):

Initial TodoWrite:
  [ ] Iteration 1/10: Designer validation
  [ ] Iteration 2/10: Designer validation
  [ ] Iteration 3/10: Designer validation
  ... (create all 10 upfront)

Progress:
  [✓] Iteration 1/10: Designer validation (NEEDS IMPROVEMENT)
  [✓] Iteration 2/10: Designer validation (NEEDS IMPROVEMENT)
  [→] Iteration 3/10: Designer validation (in_progress)
  [ ] Iteration 4/10: Designer validation
  ...

User sees: "Iteration 3/10 in progress, 2 complete"

❌ WRONG - Single Loop Task:

TodoWrite:
  [→] Design validation loop (in_progress)

Problem: User sees "in_progress" for 10 minutes, no iteration visibility
```

**Mark Iteration Complete When Done:**

```
Iteration Lifecycle:

Iteration 1:
  Mark "Iteration 1/10" as in_progress
  Run designer validation
  If NEEDS IMPROVEMENT: Run developer fixes
  Mark "Iteration 1/10" as completed

Iteration 2:
  Mark "Iteration 2/10" as in_progress
  Run designer validation
  Mark "Iteration 2/10" as completed
  If PASS: Exit loop early

Result: Loop exited after 2 iterations
  [✓] Iteration 1/10 (completed)
  [✓] Iteration 2/10 (completed)
  [ ] Iteration 3/10 (not needed, loop exited)
  ...

User sees: "Loop completed in 2/10 iterations"
```

**Track Total Iterations vs Max Limit:**

```
Iteration Progress:

Max: 10 iterations
Current: 5

TodoWrite State:
  [✓] Iteration 1/10
  [✓] Iteration 2/10
  [✓] Iteration 3/10
  [✓] Iteration 4/10
  [→] Iteration 5/10
  [ ] Iteration 6/10
  ...

User sees: "Iteration 5/10 (50% through max)"

Warning at Iteration 8:
  "Iteration 8/10 - approaching max, may escalate to user if not PASS"
```

**Clear Progress Visibility:**

```
Iteration Loop with TodoWrite:

User Request: "Validate UI design"

TodoWrite:
  [✓] PHASE 1: Gather design reference
  [✓] Iteration 1/10: Designer validation (5 issues found)
  [✓] Iteration 2/10: Designer validation (3 issues found)
  [✓] Iteration 3/10: Designer validation (1 issue found)
  [→] Iteration 4/10: Designer validation (in_progress)
  [ ] Iteration 5/10: Designer validation
  ...
  [ ] PHASE 3: User validation gate

User sees:
  - Iteration 4/10 in progress, 3 completed (40% of max)
  - Issues decreasing each iteration (5 → 3 → 1)
  - Progress toward PASS
```

---

### Pattern 6: Parallel Task Tracking

**Multiple Agents Executing Simultaneously:**

When running agents in parallel, track each separately:

```
✅ CORRECT - Separate Tasks for Parallel Agents:

Multi-Model Review (3 models in parallel):

TodoWrite:
  [✓] PHASE 1: Prepare review context
  [→] PHASE 2: Claude review (in_progress)
  [→] PHASE 2: Grok review (in_progress)
  [→] PHASE 2: Gemini review (in_progress)
  [ ] PHASE 3: Consolidate reviews

Note: 3 tasks "in_progress" is OK for parallel execution
      (Exception to "one in_progress" rule)

As models complete:
  [✓] PHASE 1: Prepare review context
  [✓] PHASE 2: Claude review (completed) ← First to finish
  [→] PHASE 2: Grok review (in_progress)
  [→] PHASE 2: Gemini review (in_progress)
  [ ] PHASE 3: Consolidate reviews

User sees: "1/3 reviews complete, 2 in progress"

❌ WRONG - Single Task for Parallel Work:

TodoWrite:
  [✓] PHASE 1: Prepare
  [→] PHASE 2: Run 3 reviews (in_progress)
  [ ] PHASE 3: Consolidate

Problem: No visibility into which reviews are complete
```

**Update As Each Agent Completes:**

```
Parallel Execution Timeline:

T=0s: Launch 3 reviews in parallel
  [→] Claude review (in_progress)
  [→] Grok review (in_progress)
  [→] Gemini review (in_progress)

T=60s: Claude completes first
  [✓] Claude review (completed)
  [→] Grok review (in_progress)
  [→] Gemini review (in_progress)

T=120s: Gemini completes
  [✓] Claude review (completed)
  [→] Grok review (in_progress)
  [✓] Gemini review (completed)

T=180s: Grok completes
  [✓] Claude review (completed)
  [✓] Grok review (completed)
  [✓] Gemini review (completed)

User sees real-time completion updates
```
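The aggregate progress line shown above can be derived directly from the task list itself. A minimal TypeScript sketch, again using a hypothetical in-memory store rather than the real TodoWrite call:

```typescript
type Status = "pending" | "in_progress" | "completed";
interface Todo { content: string; status: Status; }

const reviews: Todo[] = [
  { content: "PHASE 2: Claude review", status: "in_progress" },
  { content: "PHASE 2: Grok review", status: "in_progress" },
  { content: "PHASE 2: Gemini review", status: "in_progress" },
];

// Mark one parallel task completed and report aggregate progress.
function completeReview(name: string): void {
  const task = reviews.find((t) => t.content.includes(name));
  if (task) task.status = "completed";
  const done = reviews.filter((t) => t.status === "completed").length;
  console.log(`${done}/${reviews.length} reviews complete`);
}

completeReview("Claude"); // → "1/3 reviews complete"
```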
**Progress Indicators During Long Parallel Tasks:**

```
For long-running parallel tasks (>2 minutes), show progress:

T=0s: "Launching 5 AI model reviews (estimated 5 minutes)..."
T=60s: "1/5 reviews complete..."
T=120s: "2/5 reviews complete..."
T=180s: "4/5 reviews complete, 1 in progress..."
T=240s: "All reviews complete! Consolidating results..."

TodoWrite mirrors this (snapshot at T=120s):
  [✓] Claude review (completed 1st)
  [✓] Grok review (completed 2nd)
  [→] Gemini review (in_progress)
  [→] GPT-5 review (in_progress)
  [→] DeepSeek review (in_progress)
```

---

## Integration with Other Skills

**todowrite-orchestration + multi-agent-coordination:**

```
Use Case: Multi-phase implementation workflow

Step 1: Initialize TodoWrite (todowrite-orchestration)
  Create task list for all 8 phases

Step 2: Sequential Agent Delegation (multi-agent-coordination)
  Phase 1: api-architect
    Mark PHASE 1 as in_progress
    Delegate to api-architect
    Mark PHASE 1 as completed

  Phase 2: backend-developer
    Mark PHASE 2 as in_progress
    Delegate to backend-developer
    Mark PHASE 2 as completed

  ... continue for all phases
```

**todowrite-orchestration + multi-model-validation:**

```
Use Case: Multi-model review with progress tracking

Step 1: Initialize TodoWrite (todowrite-orchestration)
  [ ] PHASE 1: Prepare context
  [ ] PHASE 2: Launch reviews (5 models)
  [ ] PHASE 3: Consolidate results

Step 2: Parallel Execution (multi-model-validation)
  Mark "PHASE 2: Launch reviews" as in_progress
  Launch all 5 models simultaneously
  As each completes: Update progress (1/5, 2/5, ...)
  Mark "PHASE 2: Launch reviews" as completed

Step 3: Real-Time Visibility (todowrite-orchestration)
  User sees: "PHASE 2: 3/5 reviews complete..."
```

**todowrite-orchestration + quality-gates:**

```
Use Case: Iteration loop with TodoWrite tracking

Step 1: Initialize TodoWrite (todowrite-orchestration)
  [ ] Iteration 1/10
  [ ] Iteration 2/10
  ...

Step 2: Iteration Loop (quality-gates)
  For i = 1 to 10:
    Mark "Iteration i/10" as in_progress
    Run designer validation
    Mark "Iteration i/10" as completed
    If PASS: Exit loop

Step 3: Progress Visibility
  User sees: "Iteration 5/10 complete, 5 remaining"
```

---

## Best Practices

**Do:**
- ✅ Initialize TodoWrite BEFORE starting work (step 0)
- ✅ List ALL phases upfront (user sees complete scope)
- ✅ Use 8-15 tasks for typical workflows (readable)
- ✅ Mark completed IMMEDIATELY after finishing (real-time)
- ✅ Keep exactly ONE task in_progress (except parallel tasks)
- ✅ Track iterations separately (Iteration 1/10, 2/10, ...)
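- ✅ Report percent complete alongside task counts (3/8 = 37.5%)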
+- ✅ Update as work progresses (not batched at end) +- ✅ Add new tasks if discovered during execution + +**Don't:** +- ❌ Create TodoWrite during workflow (initialize first) +- ❌ Hide phases from user (list all upfront) +- ❌ Create too many tasks (>20 overwhelms user) +- ❌ Batch completions at end of phase (update real-time) +- ❌ Leave multiple tasks in_progress (pick one) +- ❌ Use single task for loop (track iterations separately) +- ❌ Update only at start/end (update during execution) + +**Performance:** +- TodoWrite overhead: <1s per update (negligible) +- User visibility benefit: Reduces perceived wait time 30-50% +- Workflow confidence: User knows progress, less likely to cancel + +--- + +## Examples + +### Example 1: 8-Phase Implementation Workflow + +**Scenario:** Full-cycle implementation with TodoWrite tracking + +**Execution:** + +``` +Step 0: Initialize TodoWrite + TodoWrite: Create task list + [ ] PHASE 1: Ask user for requirements + [ ] PHASE 2: Generate architecture plan + [ ] PHASE 3: Implement core logic + [ ] PHASE 3: Add error handling + [ ] PHASE 3: Write tests + [ ] PHASE 4: Run test suite + [ ] PHASE 5: Code review + [ ] PHASE 6: Fix review issues + [ ] PHASE 7: User acceptance + [ ] PHASE 8: Generate report + + User sees: "10 tasks, 0 complete, Phase 1 starting..." + +Step 1: PHASE 1 + Mark "PHASE 1: Ask user" as in_progress + ... gather requirements (30s) ... + Mark "PHASE 1: Ask user" as completed + User sees: "1/10 tasks complete (10%)" + +Step 2: PHASE 2 + Mark "PHASE 2: Architecture plan" as in_progress + ... generate plan (2 min) ... + Mark "PHASE 2: Architecture plan" as completed + User sees: "2/10 tasks complete (20%)" + +Step 3: PHASE 3 (3 sub-tasks) + Mark "PHASE 3: Implement core" as in_progress + ... implement (3 min) ... + Mark "PHASE 3: Implement core" as completed + User sees: "3/10 tasks complete (30%)" + + Mark "PHASE 3: Add error handling" as in_progress + ... add error handling (2 min) ... + Mark "PHASE 3: Add error handling" as completed + User sees: "4/10 tasks complete (40%)" + + Mark "PHASE 3: Write tests" as in_progress + ... write tests (3 min) ... + Mark "PHASE 3: Write tests" as completed + User sees: "5/10 tasks complete (50%)" + +... continue through all phases ... + +Final State: + [✓] All 10 tasks completed + User sees: "10/10 tasks complete (100%). Workflow finished!" + +Total Duration: ~15 minutes +User Experience: Continuous progress updates every 1-3 minutes +``` + +--- + +### Example 2: Iteration Loop with Progress Tracking + +**Scenario:** Design validation with 10 max iterations + +**Execution:** + +``` +Step 0: Initialize TodoWrite + TodoWrite: Create task list + [ ] PHASE 1: Gather design reference + [ ] Iteration 1/10: Designer validation + [ ] Iteration 2/10: Designer validation + [ ] Iteration 3/10: Designer validation + [ ] Iteration 4/10: Designer validation + [ ] Iteration 5/10: Designer validation + ... (10 iterations total) + [ ] PHASE 3: User validation gate + +Step 1: PHASE 1 + Mark "PHASE 1: Gather design" as in_progress + ... gather design (20s) ... 
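  (status update sent immediately, per Pattern 3)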
+ Mark "PHASE 1: Gather design" as completed + +Step 2: Iteration Loop + Iteration 1: + Mark "Iteration 1/10" as in_progress + Designer: "NEEDS IMPROVEMENT - 5 issues" + Developer: Fix 5 issues + Mark "Iteration 1/10" as completed + User sees: "Iteration 1/10 complete, 5 issues fixed" + + Iteration 2: + Mark "Iteration 2/10" as in_progress + Designer: "NEEDS IMPROVEMENT - 3 issues" + Developer: Fix 3 issues + Mark "Iteration 2/10" as completed + User sees: "Iteration 2/10 complete, 3 issues fixed" + + Iteration 3: + Mark "Iteration 3/10" as in_progress + Designer: "NEEDS IMPROVEMENT - 1 issue" + Developer: Fix 1 issue + Mark "Iteration 3/10" as completed + User sees: "Iteration 3/10 complete, 1 issue fixed" + + Iteration 4: + Mark "Iteration 4/10" as in_progress + Designer: "PASS ✓" + Mark "Iteration 4/10" as completed + Exit loop (early exit) + User sees: "Loop completed in 4/10 iterations" + +Step 3: PHASE 3 + Mark "PHASE 3: User validation" as in_progress + ... user validates ... + Mark "PHASE 3: User validation" as completed + +Final State: + [✓] PHASE 1: Gather design + [✓] Iteration 1/10 (5 issues fixed) + [✓] Iteration 2/10 (3 issues fixed) + [✓] Iteration 3/10 (1 issue fixed) + [✓] Iteration 4/10 (PASS) + [ ] Iteration 5/10 (not needed) + ... + [✓] PHASE 3: User validation + +User Experience: Clear iteration progress, early exit visible +``` + +--- + +### Example 3: Parallel Multi-Model Review + +**Scenario:** 5 AI models reviewing code in parallel + +**Execution:** + +``` +Step 0: Initialize TodoWrite + TodoWrite: Create task list + [ ] PHASE 1: Prepare review context + [ ] PHASE 2: Claude review + [ ] PHASE 2: Grok review + [ ] PHASE 2: Gemini review + [ ] PHASE 2: GPT-5 review + [ ] PHASE 2: DeepSeek review + [ ] PHASE 3: Consolidate reviews + [ ] PHASE 4: Present results + +Step 1: PHASE 1 + Mark "PHASE 1: Prepare context" as in_progress + ... prepare (30s) ... + Mark "PHASE 1: Prepare context" as completed + +Step 2: PHASE 2 (Parallel Execution) + Mark all 5 reviews as in_progress: + [→] Claude review + [→] Grok review + [→] Gemini review + [→] GPT-5 review + [→] DeepSeek review + + Launch all 5 in parallel (4-Message Pattern) + + As each completes: + T=60s: Claude completes + [✓] Claude review + User sees: "1/5 reviews complete" + + T=90s: Gemini completes + [✓] Gemini review + User sees: "2/5 reviews complete" + + T=120s: GPT-5 completes + [✓] GPT-5 review + User sees: "3/5 reviews complete" + + T=150s: Grok completes + [✓] Grok review + User sees: "4/5 reviews complete" + + T=180s: DeepSeek completes + [✓] DeepSeek review + User sees: "5/5 reviews complete!" + +Step 3: PHASE 3 + Mark "PHASE 3: Consolidate" as in_progress + ... consolidate (30s) ... + Mark "PHASE 3: Consolidate" as completed + +Step 4: PHASE 4 + Mark "PHASE 4: Present results" as in_progress + ... present (10s) ... 
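  (final task: nothing left to mark in_progress afterward)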
+ Mark "PHASE 4: Present results" as completed + +Final State: + [✓] All 8 tasks completed + User sees: "Multi-model review complete in 3 minutes" + +User Experience: + - Real-time progress as each model completes + - Clear visibility: "3/5 reviews complete" + - Reduces perceived wait time (user knows progress) +``` + +--- + +## Troubleshooting + +**Problem: User thinks workflow is stuck** + +Cause: No TodoWrite updates for >1 minute + +Solution: Update TodoWrite more frequently, or add sub-tasks + +``` +❌ Wrong: + [→] PHASE 3: Implementation (in_progress for 10 minutes) + +✅ Correct: + [✓] PHASE 3: Implement core logic (2 min) + [✓] PHASE 3: Add error handling (3 min) + [→] PHASE 3: Write tests (in_progress, 2 min so far) + +User sees progress every 2-3 minutes +``` + +--- + +**Problem: Too many tasks (>20), overwhelming** + +Cause: Too granular task breakdown + +Solution: Group micro-tasks into larger operations + +``` +❌ Wrong (25 tasks): + [ ] Read file 1 + [ ] Read file 2 + [ ] Write file 3 + ... (25 micro-tasks) + +✅ Correct (8 tasks): + [ ] PHASE 1: Gather inputs (includes reading files) + [ ] PHASE 2: Process data + ... (8 significant operations) +``` + +--- + +**Problem: Multiple tasks "in_progress" (not parallel execution)** + +Cause: Forgot to mark previous task as completed + +Solution: Always mark completed before starting next + +``` +❌ Wrong: + [→] PHASE 1: Ask user (in_progress) + [→] PHASE 2: Plan (in_progress) ← Both in_progress? + +✅ Correct: + [✓] PHASE 1: Ask user (completed) + [→] PHASE 2: Plan (in_progress) ← Only one +``` + +--- + +## Summary + +TodoWrite orchestration provides real-time progress visibility through: + +- **Phase initialization** (create task list before starting) +- **Appropriate granularity** (8-15 tasks, significant operations) +- **Real-time updates** (mark completed immediately) +- **Exactly one in_progress** (except parallel execution) +- **Iteration tracking** (separate task per iteration) +- **Parallel task tracking** (update as each completes) + +Master these patterns and users will always know: +- What's happening now +- What's coming next +- How much progress has been made +- How much remains + +This transforms "black box" workflows into transparent, trackable processes. + +--- + +**Extracted From:** +- `/review` command (10-task initialization, phase-based tracking) +- `/implement` command (8-phase workflow with sub-tasks) +- `/validate-ui` command (iteration tracking, user feedback rounds) +- All multi-phase orchestration workflows
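
---

**Companion Sketch:**

Patterns 1 and 3 are simple enough to capture in one self-contained example. The following TypeScript is illustrative only; the `Todo` shape, the `render` helper, and the checklist output are hypothetical stand-ins for the actual TodoWrite tool:

```typescript
type Status = "pending" | "in_progress" | "completed";
interface Todo { content: string; status: Status; }

// Pattern 1: create every phase upfront, all pending, before work starts.
const todos: Todo[] = [
  "PHASE 1: Gather user inputs",
  "PHASE 2: Architecture planning",
  "PHASE 3: Implementation",
  "PHASE 4: Code review",
  "PHASE 5: Generate report",
].map((content): Todo => ({ content, status: "pending" }));

// Render the checklist the way this skill displays it: [✓], [→], [ ].
function render(list: Todo[]): string {
  const mark = { completed: "[✓]", in_progress: "[→]", pending: "[ ]" };
  const done = list.filter((t) => t.status === "completed").length;
  const lines = list.map((t) => `${mark[t.status]} ${t.content}`);
  return [...lines, `${done}/${list.length} tasks complete`].join("\n");
}

todos[0].status = "in_progress";
console.log(render(todos)); // "[→] PHASE 1: ..." then "0/5 tasks complete"
```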