commit 46dfc308646da3d61b5a5cb15cd0dc79d8f28e2b Author: Zhongwei Li Date: Sat Nov 29 18:29:18 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..81ca581 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "incident-response", + "description": "Production incident response system with incident management workflows, intelligent debugging, runbook automation, and postmortem generation. Handles incident detection, triage, mitigation, and learning.", + "version": "1.0.0", + "author": { + "name": "Grey Haven Studio", + "url": "https://github.com/greyhaven-ai/claude-code-config" + }, + "skills": [ + "./skills/incident-response", + "./skills/smart-debugging" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5209548 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# incident-response + +Production incident response system with incident management workflows, intelligent debugging, runbook automation, and postmortem generation. Handles incident detection, triage, mitigation, and learning. 
diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..134f48e --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,129 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:greyhaven-ai/claude-code-config:grey-haven-plugins/incident-response", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "e98139fea614ab40d9a45184b58fb08f0731f0c4", + "treeHash": "c7d88a1ae221c2b0329252d98ea121ca71df09fecc541514026061b32a61d593", + "generatedAt": "2025-11-28T10:17:05.202412Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "incident-response", + "description": "Production incident response system with incident management workflows, intelligent debugging, runbook automation, and postmortem generation. 
Handles incident detection, triage, mitigation, and learning.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "b585ff201e3c9b84d9096861d7cb5abdada7d3bf95b2118651fae9b8df9df635" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "a0ed949c3531f23e6c7e6c456b52477639581b0c9ebbec5a317a95b9c7b876da" + }, + { + "path": "skills/incident-response/SKILL.md", + "sha256": "928e3705fde6a5527e8bd646f3ff2c0ac7e70ebc6187187adc24c747353b4acc" + }, + { + "path": "skills/incident-response/examples/INDEX.md", + "sha256": "bee7ea36bc509579227fc5c39c1e1ed8e3b5bcb8ec8743e13c93627d29d36052" + }, + { + "path": "skills/incident-response/templates/runbook-template.md", + "sha256": "18a5eb68fe72efd9e3087dca60397a95375056575e95de845df5bfa53f56034e" + }, + { + "path": "skills/incident-response/templates/incident-timeline-template.md", + "sha256": "55880253b3e7ebd6795aa71c5164560ae54ff8c2aaed6f398f45ff45bd40789b" + }, + { + "path": "skills/incident-response/templates/INDEX.md", + "sha256": "6d15e0703653b205b82d12165e97f9eb54e1986fc21ef0da1121713dcd61fe0f" + }, + { + "path": "skills/incident-response/templates/postmortem-template.md", + "sha256": "bd34a948c249b4cdbbc05de6457c29f2b9783631f3da4afedf49dfb98abea490" + }, + { + "path": "skills/incident-response/reference/communication-templates.md", + "sha256": "c4da7566759cf1b6d9a97be3b81b03f53bc90bad92850f4df8cc90f8650ddcd3" + }, + { + "path": "skills/incident-response/reference/INDEX.md", + "sha256": "50e6fc6a7aa8e53ae574e0ac0dcac34d5df55e91154a9b9f746a7e0bc99a11a9" + }, + { + "path": "skills/incident-response/reference/rca-techniques.md", + "sha256": "2c95fae04335da03f56102738a51ec4d0b1ddb58b28689b672f13a8f0f554857" + }, + { + "path": "skills/smart-debugging/SKILL.md", + "sha256": "90d5513d49b94433cf7d683cae85c4093c823ff617f6c5a181fa3f1c0f0aebe4" + }, + { + "path": "skills/smart-debugging/checklists/systematic-debugging-checklist.md", + "sha256": 
"eed5b6e576558bbf6f9c993da3684244e6eb7a9dff021a0d8e430a6b75678639" + }, + { + "path": "skills/smart-debugging/examples/performance-bug-debug.md", + "sha256": "20e779019c720e079b2488dddf51bd49608dedaf5460f0cfe0cad327700b688c" + }, + { + "path": "skills/smart-debugging/examples/type-error-debug-example.md", + "sha256": "d2a67a222696a79a875ed50f22af8f058738db1ee9a303214fcbb7471c3084d0" + }, + { + "path": "skills/smart-debugging/examples/INDEX.md", + "sha256": "5d90a0784401c7e62a897075f2c6ce749f0845eca427b1b9b0e4a3b965027e07" + }, + { + "path": "skills/smart-debugging/examples/integration-failure-debug.md", + "sha256": "cba4c16364e7da33fb48b55f0616bd8bb5a8fa34e130f8942e672ff92353ecec" + }, + { + "path": "skills/smart-debugging/examples/null-pointer-debug-example.md", + "sha256": "67e9c3f87bae7c9d632fdfee6927c96ad9175de1fa89a3ff3b9325014d3e1f0c" + }, + { + "path": "skills/smart-debugging/templates/rca-template.md", + "sha256": "e99f71a4ffa70e5f53d43b0c8c09bef038751fe940878cf0f240a2904c3e304f" + }, + { + "path": "skills/smart-debugging/reference/error-patterns-database.md", + "sha256": "f362cc5eb4391d6f07f45fa86b0f5a4b47bf70ccfdf28495910afd5c36c7bf99" + }, + { + "path": "skills/smart-debugging/reference/rca-methodology.md", + "sha256": "890285c0497b5f8755f017fca66a2a7efac18fe6e84d0328f01e780469683e62" + }, + { + "path": "skills/smart-debugging/reference/INDEX.md", + "sha256": "53b38a8afd5fa6ff071bc9cc4c6c0ed0cb80dfbfb4840c3d750d2646fa1e8c17" + }, + { + "path": "skills/smart-debugging/reference/stack-trace-patterns.md", + "sha256": "62d82fdaab7d779817c81f3b37b35202479a8fcbc8b4db93b0fa4ef96dcd2e50" + }, + { + "path": "skills/smart-debugging/reference/fix-generation-patterns.md", + "sha256": "9a6e4e25ea39ef513647da386624c476aa1d3a810e1c956b3cc932bf0508197e" + } + ], + "dirSha256": "c7d88a1ae221c2b0329252d98ea121ca71df09fecc541514026061b32a61d593" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git 
a/skills/incident-response/SKILL.md b/skills/incident-response/SKILL.md new file mode 100644 index 0000000..c7197e7 --- /dev/null +++ b/skills/incident-response/SKILL.md @@ -0,0 +1,26 @@ +# Incident Response Skill + +Handle production incidents with SRE best practices including detection, investigation, mitigation, recovery, and postmortems. + +## Description + +Production incident response following SRE methodologies with incident timeline tracking, RCA documentation, and runbook updates. + +## What's Included + +- **Examples**: SEV1 incident handling, postmortem templates +- **Reference**: SRE best practices, incident severity levels +- **Templates**: Incident reports, RCA documents, runbook updates + +## Use When + +- Production outages +- SEV1/SEV2 incidents +- Postmortem creation +- Runbook updates + +## Related Agents + +- `incident-responder` + +**Skill Version**: 1.0 diff --git a/skills/incident-response/examples/INDEX.md b/skills/incident-response/examples/INDEX.md new file mode 100644 index 0000000..7dfce37 --- /dev/null +++ b/skills/incident-response/examples/INDEX.md @@ -0,0 +1,122 @@ +# Incident Response Examples + +Real-world production incident examples demonstrating systematic incident response, root cause analysis, mitigation strategies, and blameless postmortems. 
+ +## Available Examples + +### SEV1: Critical Database Outage + +**File**: [sev1-critical-database-outage.md](sev1-critical-database-outage.md) + +Complete database failure causing total service outage: +- **Incident**: PostgreSQL primary failure, 100% error rate +- **Impact**: All services down, $50K revenue loss/hour +- **Root Cause**: Disk full on primary, replication lag spike +- **Resolution**: Promoted replica, cleared disk space, restored service +- **MTTR**: 45 minutes (detection → full recovery) +- **Prevention**: Disk monitoring alerts, automatic disk cleanup, replica promotion automation + +**Key Learnings**: +- Importance of replica promotion runbooks +- Disk space monitoring thresholds +- Automated failover procedures + +--- + +### SEV2: API Performance Degradation + +**File**: [sev2-api-performance-degradation.md](sev2-api-performance-degradation.md) + +Gradual performance degradation due to memory leak: +- **Incident**: API p95 latency 200ms → 5000ms over 2 hours +- **Impact**: 30% of users affected, slow page loads +- **Root Cause**: Memory leak in worker process, OOM killing workers +- **Resolution**: Identified leak with heap snapshot, deployed fix, restarted workers +- **MTTR**: 3 hours (detection → permanent fix) +- **Prevention**: Memory profiling in CI/CD, heap snapshot automation, worker restart automation + +**Key Learnings**: +- Early detection through gradual alerts +- Heap snapshot analysis for memory leaks +- Temporary mitigation (worker restarts) vs permanent fix + +--- + +### SEV3: Feature Flag Misconfiguration + +**File**: [sev3-feature-flag-misconfiguration.md](sev3-feature-flag-misconfiguration.md) + +Feature flag enabled for wrong audience causing confusion: +- **Incident**: Experimental feature shown to 20% of production users +- **Impact**: 200 support tickets, user confusion, no revenue impact +- **Root Cause**: Feature flag percentage set to 20% instead of 0% +- **Resolution**: Disabled flag, sent customer communication, 
updated flag process +- **MTTR**: 30 minutes (detection → resolution) +- **Prevention**: Feature flag code review, staging validation, gradual rollout process + +**Key Learnings**: +- Feature flag validation before production +- Importance of clear documentation +- Quick rollback procedures + +--- + +### Distributed Tracing Investigation + +**File**: [distributed-tracing-investigation.md](distributed-tracing-investigation.md) + +Using Jaeger distributed tracing to find microservice bottleneck: +- **Incident**: Checkout API slow (3s p95), unclear which service +- **Investigation**: Used Jaeger to trace request flow across 7 microservices +- **Root Cause**: Payment service calling external API synchronously (2.8s) +- **Resolution**: Moved external API call to async background job +- **Impact**: p95 latency 3000ms → 150ms (95% faster) + +**Key Learnings**: +- Power of distributed tracing for microservices +- Synchronous external dependencies are dangerous +- Background jobs for non-critical operations + +--- + +### Cascade Failure Prevention + +**File**: [cascade-failure-prevention.md](cascade-failure-prevention.md) + +Preventing cascade failure through circuit breakers and bulkheads: +- **Incident**: Auth service down, caused all dependent services to fail +- **Impact**: Complete outage instead of graceful degradation +- **Root Cause**: No circuit breakers, all services retrying auth indefinitely +- **Resolution**: Implemented circuit breakers, bulkhead isolation, fallback logic +- **Prevention**: Circuit breaker pattern, timeout configuration, graceful degradation + +**Key Learnings**: +- Circuit breakers prevent cascade failures +- Bulkhead isolation limits blast radius +- Fallback logic enables graceful degradation + +--- + +## Learning Outcomes + +After studying these examples, you will understand: + +1. **Incident Classification**: How to assess severity (SEV1-SEV4) based on impact +2. 
**Incident Command**: Role of IC, communication protocols, timeline management +3. **Root Cause Analysis**: 5 Whys, timeline reconstruction, data-driven investigation +4. **Mitigation Strategies**: Immediate actions, temporary fixes, permanent solutions +5. **Blameless Postmortems**: Focus on systems not people, actionable items, continuous improvement +6. **Communication**: Internal updates, external communications, executive briefings +7. **Prevention**: Monitoring improvements, runbook automation, architectural changes + +--- + +## Related Documentation + +- **Reference**: [Reference Index](../reference/INDEX.md) - Severity matrix, communication templates, RCA techniques +- **Templates**: [Templates Index](../templates/INDEX.md) - Incident timeline, postmortem, runbook templates +- **Main Agent**: [incident-responder.md](../incident-responder.md) - Incident responder agent + +--- + +Return to [main agent](../incident-responder.md) diff --git a/skills/incident-response/reference/INDEX.md b/skills/incident-response/reference/INDEX.md new file mode 100644 index 0000000..439212c --- /dev/null +++ b/skills/incident-response/reference/INDEX.md @@ -0,0 +1,82 @@ +# Incident Response Reference + +Quick reference guides for incident severity classification, communication templates, root cause analysis techniques, and runbook structure. 
+ +## Available References + +### Incident Severity Matrix + +**File**: [incident-severity-matrix.md](incident-severity-matrix.md) + +Complete severity classification guide with examples: +- **SEV1 (Critical)**: Complete outage, all customers affected, revenue stopped +- **SEV2 (Major)**: Partial degradation, significant customer impact +- **SEV3 (Minor)**: Isolated issues, workarounds available +- **SEV4 (Cosmetic)**: UI issues, no functional impact + +**Use when**: Classifying incident severity, determining escalation path + +--- + +### Communication Templates + +**File**: [communication-templates.md](communication-templates.md) + +Ready-to-use templates for all incident communications: +- Internal updates (Slack, email) +- External communications (status page, customer emails) +- Executive briefings +- Post-incident summaries +- Postmortem distribution + +**Use when**: Communicating during or after incidents + +--- + +### Root Cause Analysis Techniques + +**File**: [rca-techniques.md](rca-techniques.md) + +Comprehensive RCA methodology guide: +- **5 Whys**: Iterative questioning to find root cause +- **Fishbone Diagrams**: Category-based analysis +- **Timeline Reconstruction**: Event sequencing and correlation +- **Contributing Factors Analysis**: Immediate vs underlying vs latent causes +- **Hypothesis Testing**: Data-driven validation + +**Use when**: Conducting root cause analysis, writing postmortems + +--- + +### Runbook Structure Guide + +**File**: [runbook-structure-guide.md](runbook-structure-guide.md) + +Best practices for writing effective runbooks: +- Standard runbook template +- Diagnostic procedures +- Remediation steps +- Escalation paths +- Success criteria +- Runbook maintenance + +**Use when**: Creating or updating runbooks, automating diagnostics + +--- + +## Quick Links + +**By Use Case**: +- Need to classify incident severity → [Severity Matrix](incident-severity-matrix.md) +- Need to communicate during incident → [Communication 
Templates](communication-templates.md) +- Need to find root cause → [RCA Techniques](rca-techniques.md) +- Need to write a runbook → [Runbook Structure Guide](runbook-structure-guide.md) + +**Related Documentation**: +- **Examples**: [Examples Index](../examples/INDEX.md) - Real-world incident examples +- **Templates**: [Templates Index](../templates/INDEX.md) - Incident timeline, postmortem templates +- **Main Agent**: [incident-responder.md](../incident-responder.md) - Incident responder agent + +--- + +Return to [main agent](../incident-responder.md) diff --git a/skills/incident-response/reference/communication-templates.md b/skills/incident-response/reference/communication-templates.md new file mode 100644 index 0000000..3982f56 --- /dev/null +++ b/skills/incident-response/reference/communication-templates.md @@ -0,0 +1,213 @@ +# Communication Templates + +Copy-paste templates for incident communications across all channels and severity levels. + +## Internal Communications + +### SEV1 Incident Start + +``` +🚨 SEV1 INCIDENT DECLARED 🚨 +Incident ID: INC-YYYY-MM-DD-XXX +Impact: [Brief description - e.g., "100% outage, database down"] +Affected Services: [List services] +Customer Impact: [All users / X% of users / Specific feature unavailable] + +Roles: +- Incident Commander: @[name] +- Technical Lead: @[name] +- Communications Lead: @[name] + +War Room: +- Slack: #incident-XXX +- Zoom: [link] + +Status: Investigating +Next Update: [Time] (15 minutes or on status change) +Runbook: [Link if available] +``` + +### SEV2 Incident Start + +``` +⚠️ SEV2 INCIDENT +Incident ID: INC-YYYY-MM-DD-XXX +Impact: [Brief description - e.g., "API degraded, 30% users affected"] +Symptoms: [What users are experiencing] + +IC: @[name] +Status: Investigating [suspected cause] +Next Update: 30 minutes +``` + +### Incident Update + +``` +📊 UPDATE #[N] (T+[X] minutes) +Root Cause: [What we found OR "Still investigating"] +Mitigation: [What we're doing] +Impact: [Current status - 
improving/stable/worsening] +ETA: [Expected resolution time OR "Unknown"] +Next Update: [Time] +``` + +### Incident Resolved + +``` +🎉 INCIDENT RESOLVED (T+[X] minutes/hours) +Final Status: All services operational +Root Cause: [Brief summary] +Fix Applied: [What was done] +Monitoring: Ongoing for [duration] + +Postmortem: Scheduled for [date/time] +Timeline: [Link to detailed timeline] +``` + +## External Communications + +### Status Page - Investigating + +``` +🔴 INVESTIGATING - [Brief Title] + +We are investigating reports of [issue description]. +Our team is actively working to identify the cause. + +Affected: [Service names] +Started: [HH:MM UTC] +Next Update: [HH:MM UTC] +``` + +### Status Page - Identified + +``` +🟡 IDENTIFIED - [Issue Title] + +We have identified the issue as [brief cause]. +Our team is implementing a fix. + +Affected: [Service names] +Started: [HH:MM UTC] +Identified: [HH:MM UTC] +Est. Resolution: [HH:MM UTC] +``` + +### Status Page - Monitoring + +``` +🟢 MONITORING - [Issue Title] Resolved + +The issue has been resolved and services are operating normally. +We are monitoring to ensure stability. + +Started: [HH:MM UTC] +Resolved: [HH:MM UTC] +Duration: [X] minutes +``` + +### Customer Email - SEV1 Postmortem + +``` +Subject: Service Disruption - [Date] Postmortem + +Dear [Product Name] Customers, + +On [Date] at [Time UTC], we experienced a service disruption that affected [all users / X% of users] for approximately [duration]. + +What Happened: +[2-3 sentence summary of the incident] + +Impact: +- Duration: [X] minutes +- Affected Users: [percentage or description] +- Services Impacted: [list] + +Root Cause: +[1-2 sentence explanation of root cause] + +Resolution: +[1-2 sentences on how we fixed it] + +Prevention: +We have implemented the following measures to prevent recurrence: +1. [Measure 1] +2. [Measure 2] +3. [Measure 3] + +We sincerely apologize for the inconvenience and appreciate your patience. 
+ +[Team Name] +[Company Name] +``` + +## Executive Briefings + +### Initial Notification (SEV1 only) + +``` +Subject: SEV1 Incident - [Brief Title] + +Summary: +- Incident ID: INC-YYYY-MM-DD-XXX +- Started: [HH:MM UTC] +- Impact: [All users affected / X% affected / revenue stopped] +- Status: [Investigating / Mitigation in progress] + +Current Situation: +[2-3 sentences explaining what's happening] + +Response: +- IC: [Name] +- Team: [X] engineers actively working +- ETA: [Time if known, "Unknown" if not] + +Business Impact: +- Revenue: [Estimated $ per hour OR "Minimal"] +- Customers: [Number affected] +- SLA: [Yes/No breach, details] + +Next Update: [Time] +``` + +### Resolution Summary (Executive) + +``` +Subject: SEV1 Resolved - [Brief Title] + +Incident INC-YYYY-MM-DD-XXX has been resolved after [duration]. + +Timeline: +- Started: [HH:MM UTC] +- Identified: [HH:MM UTC] +- Resolved: [HH:MM UTC] +- Total Duration: [X] minutes + +Impact: +- Customers Affected: [Number / percentage] +- Revenue Loss: [$X estimated] +- SLA Breach: [Yes/No] + +Root Cause: +[1-2 sentences] + +Resolution: +[1-2 sentences on fix] + +Prevention: +[2-3 key action items with owners and dates] + +Postmortem: Scheduled for [date/time] + +[IC Name] +``` + +## Related Documentation + +- **Severity Matrix**: [incident-severity-matrix.md](incident-severity-matrix.md) - When to use each template +- **Examples**: [Examples Index](../examples/INDEX.md) - Real communications from incidents +- **Templates**: [Templates Index](../templates/INDEX.md) - Full incident timeline and postmortem templates + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/incident-response/reference/rca-techniques.md b/skills/incident-response/reference/rca-techniques.md new file mode 100644 index 0000000..bc46e03 --- /dev/null +++ b/skills/incident-response/reference/rca-techniques.md @@ -0,0 +1,260 @@ +# Root Cause Analysis Techniques + +Comprehensive methods for identifying root causes through 5 Whys, 
Fishbone Diagrams, Timeline Reconstruction, and data-driven hypothesis testing. + +## 5 Whys Technique + +### Method + +Ask "Why?" iteratively until you reach the root cause (typically 5 levels deep, but can be 3-7). + +**Rules**: +1. Start with the problem statement +2. Ask "Why did this happen?" +3. Answer based on facts/data (not assumptions) +4. Repeat for each answer until root cause found +5. Root cause = **systemic issue** (not human error) + +### Example + +**Problem**: Database went down + +``` +Why 1: Why did the database go down? +→ Because the primary database ran out of disk space + +Why 2: Why did it run out of disk space? +→ Because PostgreSQL logs filled the entire disk (450GB) + +Why 3: Why did logs grow to 450GB? +→ Because log rotation was disabled + +Why 4: Why was log rotation disabled? +→ Because the `log_truncate_on_rotation` config was set to `off` during a migration + +Why 5: Why was this config change not caught? +→ Because configuration changes are not code-reviewed and there was no disk monitoring alert + +ROOT CAUSE: Missing disk monitoring alerts + configuration change without code review +``` + +**Action Items**: +- Add disk usage monitoring (>90% alert) +- Require code review for all config changes +- Enable log rotation on all databases + +--- + +## Fishbone Diagram (Ishikawa) + +### Method + +Categorize contributing factors into major categories to identify root cause systematically. + +**Categories** (6M's): +1. **Method** (Process) +2. **Machine** (Technology) +3. **Material** (Inputs/Data) +4. **Measurement** (Monitoring) +5. **Mother Nature** (Environment) +6. 
**Manpower** (People/Skills) + +### Example + +**Problem**: API performance degraded (p95: 200ms → 2000ms) + +``` + API Performance Degraded + │ + ┌───────────────────┬───────────┴───────────┬───────────────────┐ + │ │ │ │ + METHOD MACHINE MATERIAL MEASUREMENT + (Process) (Technology) (Data) (Monitoring) + │ │ │ │ + No memory EventEmitter Large dataset No heap + profiling in listeners leak processing snapshots + code review (not removed) (100K orders) in CI/CD + │ │ │ │ + No long-running Node.js v14 High traffic No gradual + load tests (old GC) spike (2x) alerts + (only 5min) (1h → 2h) +``` + +**Root Causes Identified**: +- **Machine**: EventEmitter leak (technical) +- **Measurement**: No heap monitoring (monitoring gap) +- **Method**: No memory profiling in code review (process gap) + +--- + +## Timeline Reconstruction + +### Method + +Build chronological timeline of events to identify causation and correlation. + +**Steps**: +1. Gather logs from all systems (with timestamps) +2. Normalize to UTC +3. Plot events chronologically +4. Identify cause-and-effect relationships +5. 
Find the triggering event + +### Example + +``` +12:00:00 - Normal operation (p95: 200ms, memory: 400MB) +12:15:00 - Code deployment (v2.15.4) +12:30:00 - Memory: 720MB (+80% in 15min) ⚠️ +12:45:00 - Memory: 1.2GB (+67% in 15min) ⚠️ +13:00:00 - Memory: 1.8GB (+50% in 15min) 🚨 +13:00:00 - p95 latency: 800ms (4x slower) +13:15:00 - Memory: 2.3GB (limit reached) +13:15:00 - Workers start OOMing +13:20:00 - p95 latency: 2000ms (10x slower) +13:30:00 - Alert fired: High latency +14:00:00 - Alert fired: High memory + +CORRELATION: +- Deployment at 12:15 → Memory growth starts at 12:30 +- Memory growth → Latency increase (correlated) +- TRIGGER: Code deployment v2.15.4 + +ACTION: Review code changes in v2.15.4 +``` + +--- + +## Contributing Factors Analysis + +### Levels of Causation + +**Immediate Cause** (What happened): +- Direct technical failure +- Example: EventEmitter listeners not removed + +**Underlying Conditions** (Why it was possible): +- Missing safeguards +- Example: No memory profiling in code review + +**Latent Failures** (Systemic weaknesses): +- Organizational/process gaps +- Example: No developer training on memory management + +### Example + +**Incident**: Memory leak in production + +``` +Immediate Cause: +└─ Code: EventEmitter .on() used without .removeListener() + +Underlying Conditions: +├─ No code review caught the issue +├─ No memory profiling in CI/CD +└─ Short load tests (5min) didn't reveal gradual leak + +Latent Failures: +├─ Team lacks memory management training +├─ No documentation on EventEmitter best practices +└─ Culture of "ship fast, fix later" +``` + +--- + +## Hypothesis Testing + +### Method + +Generate hypotheses, test with data, validate or reject. + +**Process**: +1. Observe symptoms +2. Generate hypotheses (educated guesses) +3. Design experiments to test each hypothesis +4. Collect data +5. Accept or reject hypothesis +6. 
Repeat until root cause found + +### Example + +**Symptom**: Checkout API slow (p95: 3000ms) + +**Hypothesis 1**: Database slow queries +``` +Test: Check slow query log +Data: All queries < 50ms ✅ +Result: REJECTED - database is fast +``` + +**Hypothesis 2**: External API slow +``` +Test: Distributed tracing (Jaeger) +Data: Fraud check API: 2750ms (91% of total time) 🚨 +Result: ACCEPTED - external API is bottleneck +``` + +**Hypothesis 3**: Network latency +``` +Test: curl timing breakdown +Data: DNS: 50ms, Connect: 30ms, Transfer: 2750ms +Result: PARTIAL - transfer is slow (not DNS/connect) +``` + +**Root Cause**: External fraud check API slow (blocking checkout) + +--- + +## Blameless RCA Principles + +### Core Tenets + +1. **Focus on Systems, Not People** + - ❌ "Engineer made a mistake" + - ✅ "Process didn't catch config error" + +2. **Assume Good Intent** + - Everyone did the best they could with information available + - Blame discourages honesty and learning + +3. **Multiple Contributing Factors** + - Never a single cause + - Usually 3-5 factors contribute + +4. **Actionable Improvements** + - Fix the system, not the person + - Concrete action items with owners + +### Example (Blameless vs Blame) + +**Blameful (BAD)**: +``` +Root Cause: Engineer Jane deployed code without testing +Action Item: Remind Jane to test before deploying +``` + +**Blameless (GOOD)**: +``` +Root Cause: Deployment process allowed untested code to reach production +Contributing Factors: +1. No automated tests in CI/CD +2. Manual deployment process (prone to human error) +3. No staging environment validation + +Action Items: +1. Add automated tests to CI/CD (Owner: Mike, Due: Dec 20) +2. Require staging deployment + validation before production (Owner: Sarah, Due: Dec 22) +3. 
Implement deployment checklist (Owner: Alex, Due: Dec 18) +``` + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - RCA examples from real incidents +- **Severity Matrix**: [incident-severity-matrix.md](incident-severity-matrix.md) - When to perform RCA +- **Templates**: [Postmortem Template](../templates/postmortem-template.md) - Structured RCA format + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/incident-response/templates/INDEX.md b/skills/incident-response/templates/INDEX.md new file mode 100644 index 0000000..b4a5f25 --- /dev/null +++ b/skills/incident-response/templates/INDEX.md @@ -0,0 +1,76 @@ +# Incident Response Templates + +Ready-to-use templates for incident timelines, blameless postmortems, and runbooks. Copy and fill in for your incidents. + +## Available Templates + +### Incident Timeline Template + +**File**: [incident-timeline-template.md](incident-timeline-template.md) + +Real-time incident tracking template: +- Incident overview (ID, severity, impact) +- Chronological timeline (minute-by-minute) +- Role assignments (IC, Tech Lead, Comms) +- Status updates +- Resolution summary + +**Use when**: Tracking ongoing incident in real-time + +--- + +### Postmortem Template + +**File**: [postmortem-template.md](postmortem-template.md) + +Blameless postmortem template: +- Executive summary +- Timeline reconstruction +- Root cause analysis (5 Whys) +- Contributing factors +- Action items with owners +- Lessons learned + +**Use when**: Documenting incident after resolution (within 24-48 hours) + +--- + +### Runbook Template + +**File**: [runbook-template.md](runbook-template.md) + +Standard runbook structure: +- Problem description +- Diagnostic steps with commands +- Mitigation procedures +- Escalation paths +- Success criteria + +**Use when**: Creating new runbook or updating existing one + +--- + +## Template Usage + +**How to use**: +1. Copy template to your documentation system +2. 
Fill in all bracketed placeholders (e.g. `[FILL IN]`, `[INCIDENT TITLE]`, `[HH:MM]`) +3. Remove optional sections if not applicable +4. Share with team for review + +**When to create**: +- **Incident Timeline**: As soon as SEV1/SEV2 declared (real-time) +- **Postmortem**: Within 24-48 hours of incident resolution +- **Runbook**: After any new incident type or process improvement + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - See completed examples +- **Reference**: [Reference Index](../reference/INDEX.md) - RCA techniques, communication templates +- **Main Agent**: [incident-responder.md](../incident-responder.md) - Incident responder agent + +--- + +Return to [main agent](../incident-responder.md) diff --git a/skills/incident-response/templates/incident-timeline-template.md b/skills/incident-response/templates/incident-timeline-template.md new file mode 100644 index 0000000..52bd01e --- /dev/null +++ b/skills/incident-response/templates/incident-timeline-template.md @@ -0,0 +1,147 @@ +# Incident Timeline: [INCIDENT TITLE] + +**Incident ID**: INC-YYYY-MM-DD-XXX +**Severity**: [SEV1 / SEV2 / SEV3] +**Status**: [Investigating / Mitigating / Resolved / Monitoring] +**Started**: [YYYY-MM-DD HH:MM UTC] + +--- + +## Incident Overview + +**Impact**: +- Customer Impact: [All users / X% of users / Specific feature] +- Services Affected: [List affected services] +- Error Rate: [X%] +- Revenue Impact: [$X estimated] + +**Symptoms**: +- [User-facing symptom 1] +- [User-facing symptom 2] +- [Metric: baseline → current] + +--- + +## Team + +**Incident Commander**: @[name] +**Technical Lead**: @[name] +**Communications Lead**: @[name] +**Scribe**: @[name] +**SMEs**: @[name1], @[name2] + +**Channels**: +- Slack: #incident-XXX +- Zoom: [link] +- Status Page: [link] + +--- + +## Timeline + +| Time (UTC) | Event | Action Taken | Owner | Status | +|------------|-------|--------------|-------|--------| +| [HH:MM] | [Alert fired / Issue detected] | [What was done] | @[name] | 🔴 Started | +| 
[HH:MM] | [IC joined] | [Declared severity, assigned roles] | @[IC] | 🔴 Investigating | +| [HH:MM] | [Discovery] | [What was found] | @[name] | 🔴 Investigating | +| [HH:MM] | [Root cause identified] | [What the root cause is] | @[name] | 🟡 Identified | +| [HH:MM] | [Mitigation started] | [What fix is being applied] | @[name] | 🟡 Mitigating | +| [HH:MM] | [Mitigation complete] | [Verification of fix] | @[name] | 🟢 Mitigated | +| [HH:MM] | [Incident resolved] | [All checks passing] | @[IC] | 🟢 Resolved | + +**Total Duration**: [X] minutes/hours + +--- + +## Status Updates + +### Update #1 ([HH:MM UTC] - T+[X] min) + +**Status**: [Investigating / Mitigating] +**Root Cause**: [Known / Unknown - investigating X] +**Current Actions**: [What team is doing] +**Impact**: [Current impact status] +**ETA**: [Estimated resolution time OR "Unknown"] +**Next Update**: [Time] + +### Update #2 ([HH:MM UTC] - T+[X] min) + +[Same format as Update #1] + +### Final Update ([HH:MM UTC] - T+[X] min) + +**Status**: Resolved +**Root Cause**: [Brief summary] +**Fix Applied**: [What was done] +**Impact**: Resolved +**Monitoring**: [Ongoing monitoring period] + +--- + +## Root Cause (Brief) + +**Immediate Cause**: [What directly caused the issue] + +**Contributing Factors**: +1. [Factor 1] +2. [Factor 2] +3. 
[Factor 3] + +--- + +## Resolution Summary + +**Temporary Fix** (if applicable): +- [What was done to quickly mitigate] +- [When it was applied] + +**Permanent Fix**: +- [What was done for long-term solution] +- [When it was applied] + +**Verification**: +- [How we confirmed the fix worked] +- [Metrics that returned to normal] + +--- + +## Communications + +### Internal + +- [HH:MM] - SEV1 declared in #incidents +- [HH:MM] - Update #1 posted +- [HH:MM] - Update #2 posted +- [HH:MM] - Resolution announced + +### External + +- [HH:MM] - Status page: "Investigating" +- [HH:MM] - Status page: "Identified" +- [HH:MM] - Status page: "Monitoring" +- [HH:MM] - Status page: "Resolved" +- [HH:MM] - Customer email sent (if applicable) + +### Executive + +- [HH:MM] - Initial notification to CTO/CEO (SEV1 only) +- [HH:MM] - Resolution summary sent + +--- + +## Next Steps + +- [ ] Full postmortem scheduled: [Date/Time] +- [ ] Action items created in Linear +- [ ] Runbook updated with new learnings +- [ ] Monitoring improvements identified + +--- + +## Notes + +[Any additional context, observations, or learnings captured during the incident] + +--- + +Return to [templates index](INDEX.md) diff --git a/skills/incident-response/templates/postmortem-template.md b/skills/incident-response/templates/postmortem-template.md new file mode 100644 index 0000000..d9b7090 --- /dev/null +++ b/skills/incident-response/templates/postmortem-template.md @@ -0,0 +1,187 @@ +# Postmortem: [INCIDENT TITLE] + +**Date**: [YYYY-MM-DD] +**Incident ID**: INC-YYYY-MM-DD-XXX +**Severity**: [SEV1 / SEV2 / SEV3] +**Author**: [Name] +**Reviewers**: [Names] +**Status**: [Draft / Final] + +--- + +## Executive Summary + +**What Happened**: [2-3 sentence summary of the incident] + +**Impact**: +- **Duration**: [X] minutes/hours +- **Users Affected**: [All / X% / specific group] +- **Revenue Impact**: [$X estimated loss] +- **SLA Breach**: [Yes/No - details] + +**Root Cause**: [1 sentence root cause] + 
+**Resolution**: [1 sentence how it was fixed] + +**Key Actions**: [3 most important action items] + +--- + +## Timeline + +| Time (UTC) | Event | Notes | +|------------|-------|-------| +| [HH:MM] | [Event] | [Context] | +| [HH:MM] | [Event] | [Context] | +| [HH:MM] | [Event] | [Context] | + +**Duration Breakdown**: +- Detection → Identification: [X] minutes +- Identification → Mitigation: [X] minutes +- Mitigation → Full Resolution: [X] minutes +- **Total MTTR**: [X] minutes + +--- + +## Root Cause Analysis (5 Whys) + +**Why 1**: Why did [problem] happen? +→ [Answer based on facts] + +**Why 2**: Why did [previous answer] happen? +→ [Answer based on facts] + +**Why 3**: Why did [previous answer] happen? +→ [Answer based on facts] + +**Why 4**: Why did [previous answer] happen? +→ [Answer based on facts] + +**Why 5**: Why did [previous answer] happen? +→ [Answer based on facts] + +**ROOT CAUSE**: [Final systemic issue identified] + +--- + +## Contributing Factors + +### Immediate Cause +[Direct technical cause of the incident] + +### Underlying Conditions +1. [Condition that enabled the immediate cause] +2. [Condition that enabled the immediate cause] + +### Latent Failures +1. [Organizational/process weakness] +2. [Organizational/process weakness] + +--- + +## What Went Well ✅ + +1. [Something that worked well during response] +2. [Something that worked well during response] +3. [Something that worked well during response] + +--- + +## What Went Wrong ❌ + +1. [Something that didn't work or was missing] +2. [Something that didn't work or was missing] +3. 
[Something that didn't work or was missing] + +--- + +## Action Items + +| Priority | Action | Owner | Due Date | Status | Link | +|----------|--------|-------|----------|--------|------| +| P0 | [Critical - do immediately] | @[name] | [Date] | [ ] | [Link] | +| P1 | [Important - do within 1 week] | @[name] | [Date] | [ ] | [Link] | +| P2 | [Nice to have - do within 1 month] | @[name] | [Date] | [ ] | [Link] | + +### P0 Actions (Immediate) +- [ ] [Action 1] - @[owner] - [due date] +- [ ] [Action 2] - @[owner] - [due date] + +### P1 Actions (Short-Term) +- [ ] [Action 1] - @[owner] - [due date] +- [ ] [Action 2] - @[owner] - [due date] + +### P2 Actions (Long-Term) +- [ ] [Action 1] - @[owner] - [due date] +- [ ] [Action 2] - @[owner] - [due date] + +--- + +## Lessons Learned + +### Technical Learnings +1. [Technical insight gained] +2. [Technical insight gained] + +### Process Learnings +1. [Process improvement identified] +2. [Process improvement identified] + +### Communication Learnings +1. [Communication improvement identified] +2. 
[Communication improvement identified] + +--- + +## Prevention Measures + +### Immediate (Completed) +- [x] [What was done same day] +- [x] [What was done same day] + +### Short-Term (1-2 weeks) +- [ ] [What will be done soon] +- [ ] [What will be done soon] + +### Long-Term (1-3 months) +- [ ] [What will be done eventually] +- [ ] [What will be done eventually] + +--- + +## Related Incidents + +- [INC-YYYY-MM-DD-XXX] - [Brief description] - [Link] +- [INC-YYYY-MM-DD-XXX] - [Brief description] - [Link] + +--- + +## Appendix + +### Relevant Logs +``` +[Paste key log entries] +``` + +### Metrics/Graphs +[Links to Grafana dashboards, screenshots] + +### Commands Run +```bash +[Commands that were used during investigation/mitigation] +``` + +--- + +## Sign-Off + +**Incident Commander**: [Name] - [Date] +**Technical Lead**: [Name] - [Date] +**Engineering Manager**: [Name] - [Date] + +**Postmortem Review**: [Date/Time] +**Attendees**: [List of people who reviewed] + +--- + +Return to [templates index](INDEX.md) diff --git a/skills/incident-response/templates/runbook-template.md b/skills/incident-response/templates/runbook-template.md new file mode 100644 index 0000000..184ca8f --- /dev/null +++ b/skills/incident-response/templates/runbook-template.md @@ -0,0 +1,255 @@ +# Runbook: [PROBLEM TITLE] + +**Alert**: [Alert name that triggers this runbook] +**Severity**: [SEV1 / SEV2 / SEV3] +**Owner**: [Team name] +**Last Updated**: [YYYY-MM-DD] +**Last Tested**: [YYYY-MM-DD] + +--- + +## Problem Description + +[2-3 sentence description of what this problem is] + +**Symptoms**: +- [Observable symptom 1 - what users/operators see] +- [Observable symptom 2] +- [Observable symptom 3] + +**Impact**: +- **Customer Impact**: [What users experience] +- **Business Impact**: [Revenue, SLA, compliance] +- **Affected Services**: [List of services] + +--- + +## Prerequisites + +**Required Access**: +- [ ] Kubernetes cluster access (`kubectl` configured) +- [ ] Database access 
(PlanetScale/PostgreSQL) +- [ ] Cloudflare Workers access (`wrangler` configured) +- [ ] Monitoring access (Grafana, Datadog) + +**Required Tools**: +- [ ] `kubectl` v1.28+ +- [ ] `wrangler` v3+ +- [ ] `pscale` CLI +- [ ] `curl`, `jq` + +--- + +## Diagnosis + +### Step 1: [Check Initial Symptom] + +**What to check**: [Describe what this step verifies] + +```bash +# Command to run +[command] + +# Expected output (healthy): +[what you should see if everything is fine] + +# Problem indicator: +[what you see if there's an issue] +``` + +**Interpretation**: +- If [condition], then [conclusion] +- If [condition], then go to Step 2 + +--- + +### Step 2: [Verify Root Cause] + +```bash +# Command to run +[command] + +# Look for: +[what to look for in the output] +``` + +**Possible Causes**: +1. **[Cause 1]**: [How to identify] → Go to [Mitigation Option A](#option-a-cause-1) +2. **[Cause 2]**: [How to identify] → Go to [Mitigation Option B](#option-b-cause-2) +3. **[Cause 3]**: [How to identify] → Escalate to [team] + +--- + +### Step 3: [Additional Verification] + +[Only if needed for complex scenarios] + +```bash +# Commands +[commands] +``` + +--- + +## Mitigation + +### Option A: [Cause 1] + +**When to use**: [Conditions when this mitigation applies] + +**Steps**: +1. [Action 1] + ```bash + [command] + ``` + +2. [Action 2] + ```bash + [command] + ``` + +3. 
[Action 3] + ```bash + [command] + ``` + +**Verification**: +```bash +# Check that mitigation worked +[verification command] + +# Expected result: +[what you should see] +``` + +**If mitigation fails**: [What to do next - usually escalate] + +--- + +### Option B: [Cause 2] + +[Same format as Option A] + +--- + +## Rollback + +**If mitigation makes things worse:** + +```bash +# Rollback command 1 +[command to undo action 1] + +# Rollback command 2 +[command to undo action 2] +``` + +--- + +## Verification & Monitoring + +### Health Checks + +After mitigation, verify these metrics return to normal: + +```bash +# Check 1: Service health +curl https://api.greyhaven.io/health +# Expected: HTTP 200, {"status": "healthy"} + +# Check 2: Error rate +# Grafana: Error Rate dashboard +# Expected: <0.1% + +# Check 3: Latency +# Grafana: API Latency dashboard +# Expected: p95 <500ms +``` + +### Monitoring Period + +Monitor for **[time period]** after mitigation: +- [ ] Error rate stable (<0.1%) +- [ ] Latency normal (p95 <500ms) +- [ ] No new alerts +- [ ] User reports resolved + +--- + +## Escalation + +**Escalate if**: +- Mitigation doesn't work after [X] minutes +- Root cause unclear after diagnosis +- Issue is [severity] and unresolved after [X] minutes +- Multiple services affected + +**Escalation Path**: +``` +0-15 min: @oncall-engineer +15-30 min: @team-lead +30-60 min: @engineering-manager +60+ min: @vp-engineering (SEV1 only) +``` + +**Escalation Contact**: +- Team Slack: #[team-channel] +- PagerDuty: [escalation policy] +- Oncall: @[oncall-alias] + +--- + +## Common Mistakes + +### Mistake 1: [Common Error] + +**Wrong**: +```bash +[incorrect command or approach] +``` + +**Correct**: +```bash +[correct command or approach] +``` + +### Mistake 2: [Common Error] + +[Description and correction] + +--- + +## Related Documentation + +- **Alert Definition**: [Link to alert config] +- **Monitoring Dashboard**: [Link to Grafana] +- **Architecture Doc**: [Link to system 
architecture] +- **Past Incidents**: [Links to similar incidents] +- **Postmortems**: [Links to related postmortems] + +--- + +## Changelog + +| Date | Author | Changes | +|------|--------|---------| +| [YYYY-MM-DD] | @[name] | Initial creation | +| [YYYY-MM-DD] | @[name] | Updated [what changed] | + +--- + +## Testing Notes + +**Last Test Date**: [YYYY-MM-DD] +**Test Result**: [Pass / Fail] +**Notes**: [What was learned from testing] + +**How to Test**: +1. [Step to simulate failure in staging] +2. [Follow runbook] +3. [Verify recovery] +4. [Document time taken and any issues] + +--- + +Return to [templates index](INDEX.md) diff --git a/skills/smart-debugging/SKILL.md b/skills/smart-debugging/SKILL.md new file mode 100644 index 0000000..6809c55 --- /dev/null +++ b/skills/smart-debugging/SKILL.md @@ -0,0 +1,26 @@ +# Smart Debugging Skill + +AI-powered intelligent debugging with stack trace analysis, error pattern recognition, and automated fix suggestions. + +## Description + +Systematic error diagnosis through triage, investigation, root cause analysis, fix generation, and prevention strategies. 
+ +## What's Included + +- **Examples**: Stack trace analysis, error pattern fixes +- **Reference**: Debugging methodologies, common errors +- **Templates**: Debug report templates, RCA structures + +## Use When + +- Systematic error diagnosis needed +- Stack traces to analyze +- Root cause investigation +- Automated fix suggestions + +## Related Agents + +- `smart-debug` + +**Skill Version**: 1.0 diff --git a/skills/smart-debugging/checklists/systematic-debugging-checklist.md b/skills/smart-debugging/checklists/systematic-debugging-checklist.md new file mode 100644 index 0000000..5b49e41 --- /dev/null +++ b/skills/smart-debugging/checklists/systematic-debugging-checklist.md @@ -0,0 +1,280 @@ +# Systematic Debugging Checklist + +**Use when debugging errors, exceptions, or unexpected behavior.** + +## Phase 1: Triage (2-5 minutes) + +- [ ] Error message captured completely +- [ ] Stack trace obtained (full, not truncated) +- [ ] Environment identified (production, staging, development) +- [ ] Severity assessed (SEV1, SEV2, SEV3, SEV4) +- [ ] Error frequency determined (one-time, intermittent, consistent) +- [ ] First occurrence timestamp identified +- [ ] Recent changes reviewed (deployments, config changes) +- [ ] Production impact assessed (users affected, revenue impact) + +### Triage Decision +- [ ] **SEV1 (Production Down)** - Escalate to incident-responder immediately +- [ ] **SEV2 (Degraded)** - Quick investigation (10 min max), then escalate if unresolved +- [ ] **SEV3 (Bug)** - Continue with full smart-debug workflow +- [ ] **SEV4 (Enhancement)** - Document and queue for later + +## Phase 2: Stack Trace Analysis + +- [ ] Error type identified (TypeError, ValueError, KeyError, etc.) 
+- [ ] Error message parsed and understood +- [ ] Call stack extracted (all frames with file, line, function) +- [ ] Root file identified (where error originated, not propagated) +- [ ] Root line number identified +- [ ] Stdlib/third-party frames filtered out +- [ ] Related files identified (files in call stack) +- [ ] Likely cause predicted using pattern matching + +### Stack Trace Quality +- [ ] Stack trace is complete (not truncated) +- [ ] Source maps applied (for minified JavaScript) +- [ ] Line numbers accurate (code and deployed version match) + +## Phase 3: Pattern Matching + +- [ ] Error pattern database searched +- [ ] Matching error pattern found (or "unknown") +- [ ] Root cause hypothesis generated +- [ ] Fix template identified +- [ ] Prevention strategy identified +- [ ] Similar historical bugs reviewed (if available) + +### Common Patterns Checked +- [ ] Null pointer / NoneType errors +- [ ] Type mismatch errors +- [ ] Index out of range errors +- [ ] Missing dictionary keys +- [ ] Module/import errors +- [ ] Database connection errors +- [ ] API contract violations +- [ ] Concurrency errors + +## Phase 4: Code Inspection + +- [ ] Root file read completely +- [ ] Problematic line identified +- [ ] Context examined (5 lines before/after) +- [ ] Function signature examined +- [ ] Variable types inferred +- [ ] Data flow traced (inputs to problematic line) +- [ ] Assumptions identified (null checks, type validations missing) + +### Code Quality Check +- [ ] Tests exist for this code path (yes/no) +- [ ] Code has type hints (TypeScript, Python type hints) +- [ ] Code has input validation +- [ ] Code has error handling + +## Phase 5: Observability Investigation + +### Log Analysis +- [ ] Logs queried for error occurrences +- [ ] Error frequency calculated (per hour, per day) +- [ ] First occurrence timestamp confirmed +- [ ] Recent occurrences reviewed (last 10) +- [ ] Affected users identified (user IDs from logs) +- [ ] Error correlation checked (other 
errors at same time) + +### Metrics Analysis +- [ ] Error rate queried (Prometheus, Cloudflare Analytics) +- [ ] Error spike identified (yes/no, when) +- [ ] Correlation with traffic spike checked +- [ ] Correlation with deployment checked +- [ ] Resource utilization checked (CPU, memory, connections) + +### Trace Analysis +- [ ] Trace ID extracted from logs +- [ ] Distributed trace viewed (Jaeger, Zipkin) +- [ ] Span timings analyzed +- [ ] Upstream/downstream services checked +- [ ] Trace context propagation verified + +## Phase 6: Reproduce Locally + +- [ ] Test environment set up (matches production config) +- [ ] Input data identified (from logs or user report) +- [ ] Reproduction steps documented +- [ ] Error reproduced locally (consistent reproduction) +- [ ] Minimal reproduction case created (simplest input that triggers error) +- [ ] Failing test case written (pytest, vitest) +- [ ] Test runs and fails as expected + +### Reproduction Quality +- [ ] Reproduction is reliable (100% reproducible) +- [ ] Reproduction is minimal (fewest steps possible) +- [ ] Test is isolated (no external dependencies if possible) + +## Phase 7: Fix Generation + +- [ ] Fix hypothesis generated (based on pattern match and code inspection) +- [ ] Fix option 1 generated (quick fix) +- [ ] Fix option 2 generated (robust fix) +- [ ] Fix option 3 generated (best practice fix) +- [ ] Trade-offs analyzed (complexity vs. 
robustness) +- [ ] Fix option selected (document rationale) + +### Fix Options Evaluated +- [ ] **Quick fix** - Minimal code change, may not cover all cases +- [ ] **Robust fix** - Handles edge cases, more defensive +- [ ] **Best practice fix** - Follows design patterns, prevents similar bugs + +## Phase 8: Fix Application + +- [ ] Code changes made (Edit or MultiEdit tool) +- [ ] Changes reviewed for correctness +- [ ] No new bugs introduced (code review) +- [ ] Code style consistent (matches project conventions) +- [ ] Type hints added (if applicable) +- [ ] Comments added (explaining why, not what) + +### Safety Checks +- [ ] No hardcoded values (use constants or config) +- [ ] No security vulnerabilities introduced +- [ ] No performance regressions introduced +- [ ] Backwards compatibility maintained (if API change) + +## Phase 9: Test Verification + +- [ ] Failing test now passes (verify fix works) +- [ ] Full test suite runs (pytest, vitest) +- [ ] All tests pass (no regressions) +- [ ] Code coverage maintained or improved +- [ ] Integration tests pass (if applicable) +- [ ] Edge cases tested (null, empty, large inputs) + +### Test Quality +- [ ] Test is clear and readable +- [ ] Test documents expected behavior +- [ ] Test will catch regressions +- [ ] Test runs quickly (<1 second) + +## Phase 10: Root Cause Analysis + +- [ ] 5 Whys analysis performed +- [ ] True root cause identified (not just symptom) +- [ ] Contributing factors identified +- [ ] Timeline reconstructed (what happened when) +- [ ] RCA document created (using template) + +### RCA Document Contents +- [ ] Error summary (what, where, when, impact) +- [ ] Timeline of events +- [ ] Investigation steps documented +- [ ] Root cause clearly stated +- [ ] Fix applied documented (with code snippet) +- [ ] Prevention strategy documented + +## Phase 11: Prevention Strategy + +- [ ] Immediate prevention: Unit test added (prevents this specific bug) +- [ ] Short-term prevention: Integration tests added 
(prevents similar bugs) +- [ ] Long-term prevention: Architecture changes proposed (prevents class of bugs) +- [ ] Monitoring added: Alert created (detects recurrence) +- [ ] Documentation updated: Runbook created (guides future debugging) + +### Prevention Levels +- [ ] **Test Coverage** - Tests prevent regression +- [ ] **Type Safety** - Type hints catch errors at dev time +- [ ] **Input Validation** - Validates data early (Pydantic, zod) +- [ ] **Error Handling** - Graceful degradation +- [ ] **Monitoring** - Detects issues quickly +- [ ] **Documentation** - Team learns from incident + +## Phase 12: Deploy & Monitor + +### Pre-Deployment +- [ ] Fix tested in staging environment +- [ ] Performance impact assessed (no significant regression) +- [ ] Security review completed (if security-related bug) +- [ ] Deployment plan created (gradual rollout, rollback plan) +- [ ] Stakeholders notified (if high-impact bug) + +### Deployment +- [ ] Fix deployed to staging first +- [ ] Staging verification successful +- [ ] Fix deployed to production (gradual rollout if possible) +- [ ] Deployment monitoring active (logs, metrics, traces) + +### Post-Deployment +- [ ] Error logs monitored (1 hour post-deploy) +- [ ] Error rate confirmed to zero (or significantly reduced) +- [ ] No new errors introduced +- [ ] No performance degradation +- [ ] User reports checked (customer support, social media) + +### Monitoring Duration +- [ ] 1 hour: Active monitoring (logs, errors, metrics) +- [ ] 24 hours: Passive monitoring (alerting enabled) +- [ ] 1 week: Review error trends (ensure no recurrence) + +## Phase 13: Documentation & Learning + +- [ ] Error pattern database updated (add new pattern if discovered) +- [ ] Team notified of fix (Slack, email) +- [ ] Postmortem conducted (if SEV1 or SEV2) +- [ ] Lessons learned documented +- [ ] Similar code locations reviewed (apply fix broadly if needed) +- [ ] Architecture improvements proposed (if needed) + +### Knowledge Sharing +- [ ] RCA 
document shared with team +- [ ] Runbook created or updated +- [ ] Presentation given (if interesting or impactful bug) +- [ ] Blog post written (if educational value) + +## Critical Validations + +- [ ] Bug reliably reproduced before fixing +- [ ] Fix verified with passing test +- [ ] No regressions introduced +- [ ] Root cause identified (not just symptom fixed) +- [ ] Prevention strategy implemented +- [ ] Monitoring in place to detect recurrence +- [ ] Documentation complete (RCA, runbook) +- [ ] Team learns from incident + +## Debugging Anti-Patterns (Avoid These) + +- ❌ Random code changes without hypothesis +- ❌ Adding print statements without plan +- ❌ Debugging production directly (use staging) +- ❌ Ignoring error messages or stack traces +- ❌ Not writing tests to verify fix +- ❌ Fixing symptoms instead of root cause +- ❌ Skipping reproduction step +- ❌ Not documenting investigation +- ❌ Not learning from mistakes (no RCA) +- ❌ Working alone for > 30 min when stuck + +## When to Escalate + +**Escalate immediately if**: +- Production is down (SEV1) +- You're stuck for > 30 minutes with no progress +- Bug is in unfamiliar code/system +- Security vulnerability suspected +- Data corruption suspected +- Multiple systems affected + +**Who to escalate to**: +- **incident-responder** - Production SEV1/SEV2 +- **performance-optimizer** - Performance bugs +- **security-analyzer** - Security vulnerabilities +- **data-validator** - Data validation errors +- **Senior engineer** - Stuck for > 30 min +- **On-call engineer** - Outside business hours + +## Success Criteria + +- [ ] Bug fixed and verified with test +- [ ] Root cause identified and documented +- [ ] Prevention strategy implemented +- [ ] Team learns from incident +- [ ] Similar bugs prevented in future +- [ ] Documentation complete and accurate +- [ ] Debugging completed in reasonable time (< 2 hours for SEV3) diff --git a/skills/smart-debugging/examples/INDEX.md 
b/skills/smart-debugging/examples/INDEX.md new file mode 100644 index 0000000..59a1970 --- /dev/null +++ b/skills/smart-debugging/examples/INDEX.md @@ -0,0 +1,52 @@ +# Smart Debug Examples + +Complete examples demonstrating systematic debugging workflows from error triage to verified fixes. + +## Available Examples + +### [null-pointer-debug-example.md](null-pointer-debug-example.md) +Complete walkthrough of debugging a NoneType AttributeError. +- Stack trace analysis and root file identification +- Error pattern matching (null pointer pattern) +- Code inspection of problematic function +- Fix generation with 3 options (return early, custom exception, reusable helper function) +- Test-driven debugging with failing test creation +- Fix application and verification +- Root cause analysis using 5 Whys +- Prevention strategy implementation + +### [type-error-debug-example.md](type-error-debug-example.md) +Debugging type mismatch and operand type errors. +- TypeError analysis (unsupported operand types) +- Type inference from stack trace +- Pattern matching for type mismatches +- Type validation fix generation +- Unit test creation for type validation +- Static analysis recommendations (mypy, Pydantic) +- Prevention through type hints + +### [integration-failure-debug.md](integration-failure-debug.md) +Debugging API integration failures and contract violations. +- HTTP error analysis (400, 422, 500 responses) +- API contract validation against OpenAPI spec +- Request/response comparison +- Schema validation with Pydantic +- Integration test creation +- Observability integration (trace ID correlation) +- Rollback and deployment strategies + +### [performance-bug-debug.md](performance-bug-debug.md) +Debugging performance-related bugs and slow queries. 
+- Performance profiling with cProfile +- Database query analysis (N+1 detection) +- Caching strategy implementation +- Optimization verification with benchmarks +- Delegation to performance-optimizer agent +- Production monitoring setup + +## Quick Reference + +**Need null pointer help?** → [null-pointer-debug-example.md](null-pointer-debug-example.md) +**Need type error help?** → [type-error-debug-example.md](type-error-debug-example.md) +**Need API debugging?** → [integration-failure-debug.md](integration-failure-debug.md) +**Need performance debugging?** → [performance-bug-debug.md](performance-bug-debug.md) diff --git a/skills/smart-debugging/examples/integration-failure-debug.md b/skills/smart-debugging/examples/integration-failure-debug.md new file mode 100644 index 0000000..4baedc6 --- /dev/null +++ b/skills/smart-debugging/examples/integration-failure-debug.md @@ -0,0 +1,88 @@ +# Integration Failure Debug Example + +Debugging API integration failures and contract violations. + +## Error: 422 Unprocessable Entity from Payment API + +```json +{ + "detail": [ + { + "loc": ["body", "amount"], + "msg": "ensure this value is greater than 0", + "type": "value_error.number.not_gt" + } + ] +} +``` + +## Investigation + +### Request Sent + +```python +# Our code +await payment_api.create_charge({ + "amount": order.total, # Sending cents: 0 (empty cart!) + "currency": "usd", + "customer_id": "cus_123" +}) +``` + +### API Contract (OpenAPI Spec) + +```yaml +/charges: + post: + requestBody: + content: + application/json: + schema: + properties: + amount: + type: integer + minimum: 50 # $0.50 minimum! +``` + +**Issue**: Sending `amount: 0` violates API's minimum amount requirement. + +## Root Cause + +Order validation allows empty carts ($0 total). Payment API requires minimum $0.50. 
+ +## Fix + +```python +from pydantic import BaseModel, validator + +class CreateChargeRequest(BaseModel): + amount: int + currency: str + customer_id: str + + @validator('amount') + def amount_must_meet_minimum(cls, v): + if v < 50: # Match API's minimum + raise ValueError('Amount must be at least $0.50 (50 cents)') + return v + +# Service layer +async def create_charge(order: Order): + # Validate before API call + request = CreateChargeRequest( + amount=order.total_cents, + currency="usd", + customer_id=order.customer_id + ) + return await payment_api.create_charge(request.dict()) +``` + +## Prevention + +1. **Schema validation**: Validate against OpenAPI spec +2. **Contract tests**: Test API contract compliance +3. **Integration tests**: Test with real API (or mocks matching spec) + +--- + +**Result**: API contract violations caught at service boundary, not production. diff --git a/skills/smart-debugging/examples/null-pointer-debug-example.md b/skills/smart-debugging/examples/null-pointer-debug-example.md new file mode 100644 index 0000000..f912488 --- /dev/null +++ b/skills/smart-debugging/examples/null-pointer-debug-example.md @@ -0,0 +1,495 @@ +# Null Pointer Debug Example + +Complete walkthrough of debugging a NoneType AttributeError using smart-debug systematic methodology. + +## Error Encountered + +**Environment**: Production +**Severity**: SEV2 (Degraded service - user profile pages failing) +**Frequency**: 127 occurrences in last 24 hours +**First Occurrence**: 2025-01-16 14:23:00 UTC + +### Error Message + +```python +AttributeError: 'NoneType' object has no attribute 'name' +``` + +### User Report + +> "When I click on a user's profile after they've deleted their account, the page crashes with a 500 error instead of showing a 'User not found' message." 
+## Phase 1: Triage (3 minutes) + +**Severity Assessment**: +- Not production down (SEV1) +- Affects specific user workflow (profile viewing) +- 127 occurrences = moderate frequency +- **Decision**: SEV2 - Proceed with full smart-debug workflow + +**Error Category**: Runtime Exception (NoneType error) + +## Phase 2: Stack Trace Analysis + +### Full Stack Trace + +```python +Traceback (most recent call last): + File "/app/api/users.py", line 42, in get_user_profile + return {"name": user.name, "email": user.email} +AttributeError: 'NoneType' object has no attribute 'name' +``` + +### Pattern Match + +**Pattern**: `null_pointer` +**Indicators**: `'NoneType' object has no attribute` +**Likely Cause**: Accessing property on None value - check for null/undefined +**Fix Template**: Add null check before access + +## Phase 3: Code Inspection + +### Problematic Code (api/users.py:42) + +```python +@router.get("/users/{user_id}/profile") +async def get_user_profile(user_id: str): + """Get user profile information.""" + user = await db.users.find_one({"id": user_id}) + + # Line 42 - THE PROBLEM + return { + "name": user.name, + "email": user.email, + "created_at": user.created_at + } +``` + +**Issue Identified**: No null check! If `find_one()` returns `None` (user doesn't exist), accessing `user.name` causes AttributeError. + +## Root Cause + +When a user deletes their account: +1. User record is deleted from database +2. But references to that user ID may still exist (links, mentions, etc.) +3. When someone tries to view the deleted user's profile, `find_one()` returns `None` +4. 
Code tries to access `None.name` → AttributeError + +## Phase 4: Reproduce Locally + +### Test Case (Failing) + +```python +# tests/test_user_api.py +import pytest +from fastapi.testclient import TestClient + +def test_get_deleted_user_profile(client: TestClient, db): + """Test viewing profile of deleted user.""" + # Setup: Create and then delete a user + user_id = "usr_deleted_123" + db.users.insert_one({"id": user_id, "name": "Deleted User"}) + db.users.delete_one({"id": user_id}) + + # Action: Try to get profile of deleted user + response = client.get(f"/users/{user_id}/profile") + + # Expected: 404 Not Found, not 500 Internal Server Error + assert response.status_code == 404 + assert response.json() == {"detail": f"User {user_id} not found"} +``` + +### Run Test (Fails as Expected) + +```bash +$ pytest tests/test_user_api.py::test_get_deleted_user_profile -v + +tests/test_user_api.py::test_get_deleted_user_profile FAILED + +E assert 500 == 404 +E + where 500 = <Response [500]>.status_code +``` + +✅ **Reproduction Successful** - Test reliably reproduces the bug. 
+ +## Phase 5: Fix Generation + +### Option 1: Quick Fix (Return Early) + +```python +@router.get("/users/{user_id}/profile") +async def get_user_profile(user_id: str): + """Get user profile information.""" + user = await db.users.find_one({"id": user_id}) + + # Quick fix: Return early if user not found + if user is None: + raise HTTPException(status_code=404, detail=f"User {user_id} not found") + + return { + "name": user.name, + "email": user.email, + "created_at": user.created_at + } +``` + +**Pros**: Simple, fixes the immediate issue +**Cons**: Doesn't prevent similar issues elsewhere + +### Option 2: Robust Fix (Custom Exception) + +```python +# models/exceptions.py +class UserNotFoundError(Exception): + """Raised when user is not found in database.""" + def __init__(self, user_id: str): + self.user_id = user_id + super().__init__(f"User {user_id} not found") + +# api/users.py +@router.get("/users/{user_id}/profile") +async def get_user_profile(user_id: str): + """Get user profile information.""" + user = await db.users.find_one({"id": user_id}) + + if user is None: + raise UserNotFoundError(user_id) + + return { + "name": user.name, + "email": user.email, + "created_at": user.created_at + } + +# Global exception handler +@app.exception_handler(UserNotFoundError) +async def user_not_found_handler(request, exc): + return JSONResponse( + status_code=404, + content={"detail": str(exc)} + ) +``` + +**Pros**: Reusable, type-safe, better error handling +**Cons**: More boilerplate + +### Option 3: Best Practice (Helper Function) + +```python +# services/user_service.py +async def get_user_or_404(user_id: str) -> User: + """Get user by ID or raise 404.""" + user = await db.users.find_one({"id": user_id}) + if user is None: + raise HTTPException(status_code=404, detail=f"User {user_id} not found") + return user + +# api/users.py +@router.get("/users/{user_id}/profile") +async def get_user_profile(user_id: str): + """Get user profile information.""" + user = await 
get_user_or_404(user_id) # Handles None case + + return { + "name": user.name, + "email": user.email, + "created_at": user.created_at + } +``` + +**Pros**: Reusable pattern, prevents similar bugs, clean API code +**Cons**: Slightly more abstraction + +**Selected**: **Option 3** - Best practice with reusable helper + +## Phase 6: Apply Fix + +```python +# services/user_service.py +from fastapi import HTTPException + +async def get_user_or_404(user_id: str) -> User: + """ + Get user by ID or raise 404 Not Found. + + Args: + user_id: User ID to retrieve + + Returns: + User object + + Raises: + HTTPException: 404 if user not found + + Example: + user = await get_user_or_404("usr_123") + """ + user = await db.users.find_one({"id": user_id}) + if user is None: + raise HTTPException( + status_code=404, + detail=f"User {user_id} not found" + ) + return user + +# api/users.py (FIXED) +@router.get("/users/{user_id}/profile") +async def get_user_profile(user_id: str): + """Get user profile information.""" + user = await get_user_or_404(user_id) # Null-safe! + + return { + "name": user.name, + "email": user.email, + "created_at": user.created_at + } +``` + +## Phase 7: Verify Fix + +### Run Test Again (Should Pass) + +```bash +$ pytest tests/test_user_api.py::test_get_deleted_user_profile -v + +tests/test_user_api.py::test_get_deleted_user_profile PASSED ✅ +``` + +### Run Full Test Suite + +```bash +$ pytest tests/ -v + +======================== 127 passed in 4.23s ======================== +``` + +✅ **All tests pass** - No regressions introduced + +### Manual Verification + +```bash +# Test in staging +$ curl https://api-staging.example.com/users/usr_deleted_123/profile + +HTTP/1.1 404 Not Found +{ + "detail": "User usr_deleted_123 not found" +} +``` + +✅ **Fix verified** - Returns proper 404 instead of 500 + +## Phase 8: Root Cause Analysis (5 Whys) + +**Why 1**: Why did the API return 500 error? +→ Code tried to access `user.name` on None + +**Why 2**: Why was `user` None? 
+→ Database query returned None (user doesn't exist) + +**Why 3**: Why didn't code handle None case? +→ No null check before accessing user properties + +**Why 4**: Why was there no null check? +→ Developer assumed user would always exist (invalid assumption) + +**Why 5**: Why was invalid assumption made? +→ No code review guideline requiring null checks for database queries + +**ROOT CAUSE**: Missing null-safety pattern for database queries + no enforcement in code review + +## Phase 9: Prevention Strategy + +### Immediate Prevention + +✅ **Unit test added** (prevents this specific bug) + +```python +def test_get_deleted_user_profile(client, db): + # Test ensures 404 is returned for deleted users + pass +``` + +### Short-term Prevention + +✅ **Integration test added** (prevents similar bugs) + +```python +@pytest.mark.parametrize("endpoint", [ + "/users/{id}/profile", + "/users/{id}/settings", + "/users/{id}/posts" +]) +def test_user_endpoints_return_404_for_deleted_users(client, db, endpoint): + """All user endpoints should return 404 for deleted users.""" + user_id = create_and_delete_user(db) + response = client.get(endpoint.format(id=user_id)) + assert response.status_code == 404 +``` + +### Long-term Prevention + +✅ **Architecture change proposed**: Create `get_resource_or_404()` pattern + +```python +# services/base_service.py +from typing import TypeVar, Generic, Type + +T = TypeVar('T') + +class BaseService(Generic[T]): + """Base service with null-safe query methods.""" + + async def get_or_404( + self, + resource_id: str, + resource_type: str = "Resource" + ) -> T: + """Get resource by ID or raise 404.""" + resource = await self.find_one({"id": resource_id}) + if resource is None: + raise HTTPException( + status_code=404, + detail=f"{resource_type} {resource_id} not found" + ) + return resource + +# Usage across all resources +user_service = UserService() +post_service = PostService() +comment_service = CommentService() + +user = await 
user_service.get_or_404(user_id, "User") +post = await post_service.get_or_404(post_id, "Post") +``` + +### Monitoring Added + +✅ **Alert created** (detects recurrence) + +```yaml +# prometheus/alerts/user_not_found.yml +groups: + - name: user_api + rules: + - alert: HighUserNotFoundRate + expr: | + rate(http_requests_total{ + endpoint="/users/:id/profile", + status_code="404" + }[5m]) > 10 + for: 5m + annotations: + summary: "High rate of user not found errors" + description: "{{ $value }} 404s/sec on user profile endpoint" +``` + +### Documentation Updated + +✅ **Runbook created** + +```markdown +# Runbook: User Not Found Errors + +## Symptom +404 errors when accessing user profiles + +## Diagnosis +- Check if user was recently deleted +- Verify database replication lag +- Check for stale cache entries + +## Resolution +- User deleted: Expected behavior +- Replication lag: Wait 30 seconds +- Stale cache: Clear user cache + +## Prevention +Always use `get_user_or_404()` helper +``` + +## Phase 10: Deploy & Monitor + +### Pre-Deployment Checklist + +- [x] Fix tested in staging +- [x] No performance impact +- [x] Security review not needed (defensive fix) +- [x] Deployment plan created +- [x] Rollback plan ready + +### Deployment + +```bash +# Deploy to staging +$ git push origin feature/fix-user-not-found +$ ./scripts/deploy-staging.sh + +# Verify in staging (1 hour) +$ ./scripts/monitor-staging.sh --duration 1h + +# Deploy to production (gradual rollout) +$ ./scripts/deploy-production.sh --canary 10% # 10% traffic +$ sleep 600 # Monitor for 10 minutes +$ ./scripts/deploy-production.sh --canary 50% # 50% traffic +$ sleep 600 +$ ./scripts/deploy-production.sh --canary 100% # Full traffic +``` + +### Post-Deployment Monitoring + +**1 Hour Post-Deploy**: +```bash +# Check error logs +$ kubectl logs -l app=api --since=1h | grep "User.*not found" +# No unexpected errors ✅ + +# Check error rate +$ curl prometheus/query?query='rate(http_errors_total[1h])' +# No increase 
in error rate ✅ +``` + +**24 Hours Post-Deploy**: +```bash +# Verify user not found rate is zero +$ curl prometheus/query?query='rate(http_requests_total{status_code="404",endpoint="/users/:id/profile"}[24h])' +# Result: 0 errors ✅ +``` + +## Summary + +| Metric | Value | +|--------|-------| +| **Time to Reproduce** | 5 minutes | +| **Time to Fix** | 15 minutes | +| **Time to Deploy** | 30 minutes | +| **Total Time** | 50 minutes | +| **Tests Added** | 2 (unit + integration) | +| **Prevention Strategies** | 3 (tests, architecture, monitoring) | +| **Recurrences** | 0 (monitored for 1 week) | + +## Lessons Learned + +### What Went Well +1. Clear stack trace made root cause obvious +2. Test-driven debugging caught the issue immediately +3. Helper function prevents similar bugs across codebase + +### What Could Be Improved +1. Should have had null-safety pattern from the start +2. Code review should catch missing null checks +3. Static analysis could detect this pattern + +### Recommendations +1. Add `mypy` or similar for null-safety checking +2. Update code review checklist to include null-safety checks +3. Create linter rule: "Database queries must use `get_or_404` pattern" + +--- + +**Bug Fixed**: ✅ +**Tests Pass**: ✅ +**Prevention Implemented**: ✅ +**Production Stable**: ✅ diff --git a/skills/smart-debugging/examples/performance-bug-debug.md b/skills/smart-debugging/examples/performance-bug-debug.md new file mode 100644 index 0000000..7041abe --- /dev/null +++ b/skills/smart-debugging/examples/performance-bug-debug.md @@ -0,0 +1,92 @@ +# Performance Bug Debug Example + +Debugging slow database queries and N+1 problems. + +## Symptom + +API endpoint taking 4.5 seconds to respond (target: < 200ms). 
+ +## Profiling + +```python +import cProfile +import pstats + +profiler = cProfile.Profile() +profiler.enable() + +response = await get_users_with_posts() + +profiler.disable() +stats = pstats.Stats(profiler) +stats.sort_stats('cumulative') +stats.print_stats(10) +``` + +### Profile Output + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 100 4.321 0.043 4.321 0.043 database.py:42(execute_query) + 1 0.089 0.089 4.410 4.410 users.py:15(get_users_with_posts) +``` + +**Issue**: Database query called 100 times! (N+1 problem) + +## Code Analysis + +```python +# BAD: N+1 Query Problem +async def get_users_with_posts(): + users = await db.users.find_all() # 1 query + + result = [] + for user in users: # 100 iterations + posts = await db.posts.find({"user_id": user.id}) # N queries! + result.append({"user": user, "posts": posts}) + + return result # Total: 101 queries (1 + 100) +``` + +## Fix: Use Join/Eager Loading + +```python +# GOOD: Single Query with Join +async def get_users_with_posts(): + query = """ + SELECT + users.*, + json_agg(posts.*) as posts + FROM users + LEFT JOIN posts ON posts.user_id = users.id + GROUP BY users.id + """ + result = await db.execute(query) # 1 query total! + return result +``` + +## Performance Comparison + +| Approach | Queries | Time | +|----------|---------|------| +| **Before (N+1)** | 101 | 4.5s ❌ | +| **After (Join)** | 1 | 85ms ✅ | + +**Improvement**: 53x faster! + +## Prevention + +1. **Query logging**: Log all database queries in development +2. **Performance tests**: Assert query count < threshold +3. **APM monitoring**: Track query patterns in production (Datadog, New Relic) + +```python +# Performance test +def test_get_users_with_posts_query_count(query_counter): + get_users_with_posts() + assert query_counter.count <= 1, f"Expected 1 query, got {query_counter.count}" +``` + +--- + +**Result**: N+1 detected and fixed. Performance SLA met (< 200ms). 
diff --git a/skills/smart-debugging/examples/type-error-debug-example.md b/skills/smart-debugging/examples/type-error-debug-example.md new file mode 100644 index 0000000..8231cf3 --- /dev/null +++ b/skills/smart-debugging/examples/type-error-debug-example.md @@ -0,0 +1,126 @@ +# Type Error Debug Example + +Debugging type mismatch errors using systematic analysis and type validation. + +## Error Encountered + +**Environment**: Development +**Severity**: SEV3 (Bug blocking feature development) + +### Error Message + +```python +TypeError: unsupported operand type(s) for +: 'int' and 'str' +``` + +### Context + +Developer implementing new pricing calculation feature receives cryptic type error. + +## Stack Trace Analysis + +```python +Traceback (most recent call last): + File "/app/services/pricing.py", line 45, in calculate_total + total = base_price + discount +TypeError: unsupported operand type(s) for +: 'int' and 'str' +``` + +**Pattern Match**: `type_mismatch` - Incompatible types in operation + +## Code Inspection + +```python +# services/pricing.py +def calculate_total(base_price: int, discount: str) -> int: + """Calculate final price after discount.""" + # Line 45 - THE PROBLEM + total = base_price + discount # int + str = TypeError! + return total +``` + +**Issue**: `discount` parameter typed as `str` but used in numeric operation. + +## Root Cause + +API returns discount as string `"10"` instead of integer `10`. Type hint says `str`, but function logic expects `int`. + +## Fix Options + +### Option 1: Convert String to Int + +```python +def calculate_total(base_price: int, discount: str) -> int: + """Calculate final price after discount.""" + discount_int = int(discount) # Convert string to int + total = base_price - discount_int + return total +``` + +**Issue**: Still accepts `str` - misleading type hint! + +### Option 2: Fix Type Hint (Correct!) 
+ +```python +def calculate_total(base_price: int, discount: int) -> int: + """Calculate final price after discount.""" + total = base_price - discount + return total +``` + +**Better**: Type hint matches expected usage. + +### Option 3: Input Validation with Pydantic + +```python +from pydantic import BaseModel, validator + +class PricingInput(BaseModel): + base_price: int + discount: int + + @validator('discount') + def discount_must_be_positive(cls, v): + if v < 0: + raise ValueError('Discount must not be negative') + return v + +def calculate_total(input: PricingInput) -> int: + """Calculate final price after discount.""" + return input.base_price - input.discount +``` + +**Best**: Validates at API boundary, type-safe! Pydantic coerces the API's numeric string `"10"` to `10` and rejects non-numeric input. + +## Test + +```python +def test_calculate_total_with_valid_types(): + """Test with correct types.""" + result = calculate_total(PricingInput(base_price=100, discount=10)) + assert result == 90 + +def test_calculate_total_rejects_non_numeric_discount(): + """Test rejects a discount that cannot be parsed as an integer.""" + with pytest.raises(ValidationError): + PricingInput(base_price=100, discount="ten") # "10" would be coerced to 10, not rejected +``` + +## Prevention + +1. **Static type checking**: Run `mypy` in CI/CD +2. **Pydantic validation**: Validate all API inputs +3. **Integration tests**: Test with real API responses + +**Type Safety Enforcement**: +```ini +# mypy config +[mypy] +python_version = 3.11 +strict = True +disallow_untyped_defs = True +``` + +--- + +**Result**: Type error caught at dev time, not production. Type hints + Pydantic prevent recurrence. diff --git a/skills/smart-debugging/reference/INDEX.md b/skills/smart-debugging/reference/INDEX.md new file mode 100644 index 0000000..03fabd8 --- /dev/null +++ b/skills/smart-debugging/reference/INDEX.md @@ -0,0 +1,56 @@ +# Smart Debug Reference + +Debugging references and methodologies for systematic error resolution. + +## Available References + +### [error-patterns-database.md](error-patterns-database.md) +Complete error pattern catalog with fixes. 
+- **Null Pointer Errors** - NoneType, undefined, null reference +- **Type Errors** - Type mismatch, unsupported operand, conversion failures +- **Index Errors** - Array bounds, list access, slice errors +- **Key Errors** - Dictionary key missing, object property undefined +- **Import Errors** - Module not found, circular imports +- **Database Errors** - Connection refused, timeout, constraint violations +- **API Errors** - 400/422/500 responses, contract violations +- **Concurrency Errors** - Race conditions, deadlocks, async issues +- **Memory Errors** - Out of memory, memory leaks +- **Performance Errors** - Slow queries, N+1 problems, inefficient algorithms + +### [stack-trace-patterns.md](stack-trace-patterns.md) +Stack trace reading and analysis guide. +- Python stack traces (Traceback format) +- JavaScript/TypeScript stack traces (Error.stack format) +- Java stack traces (Exception format) +- Identifying root file vs. propagation +- Filtering stdlib and third-party frames +- Understanding async stack traces +- Reading minified stack traces +- Source map integration + +### [rca-methodology.md](rca-methodology.md) +Root cause analysis methodologies. +- **5 Whys** - Iterative questioning to root cause +- **Timeline Analysis** - Chronological event reconstruction +- **Fishbone Diagram** - Ishikawa cause categorization +- **Fault Tree Analysis** - Logic diagram of failure paths +- **Change Analysis** - Recent deployments and config changes +- **Comparative Analysis** - Working vs. broken environments +- **Reproducibility Testing** - Isolation of causal factors + +### [fix-generation-patterns.md](fix-generation-patterns.md) +Code fix patterns for common errors. +- Null check patterns (guard clauses, optional chaining) +- Type validation patterns (isinstance, type hints) +- Error handling patterns (try-catch, error boundaries) +- Input validation patterns (Pydantic, zod) +- Defensive programming patterns +- Fail-fast vs. 
graceful degradation +- Error recovery strategies + +## Quick Reference + +**Need error patterns?** → [error-patterns-database.md](error-patterns-database.md) +**Need stack trace help?** → [stack-trace-patterns.md](stack-trace-patterns.md) +**Need RCA methods?** → [rca-methodology.md](rca-methodology.md) +**Need fix patterns?** → [fix-generation-patterns.md](fix-generation-patterns.md) diff --git a/skills/smart-debugging/reference/error-patterns-database.md b/skills/smart-debugging/reference/error-patterns-database.md new file mode 100644 index 0000000..a263bf8 --- /dev/null +++ b/skills/smart-debugging/reference/error-patterns-database.md @@ -0,0 +1,204 @@ +# Error Patterns Database + +Comprehensive catalog of common error patterns with fixes and prevention strategies. + +## Null Pointer / None Type Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **NoneType Attribute** | `'NoneType' object has no attribute 'x'` | Accessing property on None | Add null check: `if obj is None: return` | Use Optional[] types, validation | +| **Undefined Variable** | `ReferenceError: x is not defined` (JS) | Using a variable that was never declared | Declare and initialize the variable | Use `let`/`const`, enable strict mode | +| **Null Dereference** | `Cannot read property 'x' of null` | Object is null/undefined | Optional chaining: `obj?.property` | Use TypeScript strict null checks | + +### Fix Template + +```python +# Before +user.name # May be None + +# After +if user is None: + return "Unknown" +return user.name + +# Or use getattr() with a default +getattr(user, 'name', 'Unknown') +``` + +## Type Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **Operand Type Mismatch** | `unsupported operand type(s) for +: 'int' and 'str'` | Wrong types in operation | Type conversion or fix type hint | mypy, Pydantic validation | +| **Wrong Argument Type** | `expected str, got 
int` | Passing wrong type | Convert type or fix signature | Static type checking | +| **JSON Serialization** | `Object of type datetime is not JSON serializable` | Can't serialize type | Custom JSON encoder | Use Pydantic models | + +### Fix Template + +```python +# Type validation with Pydantic +from pydantic import BaseModel + +class UserInput(BaseModel): + age: int # Automatic validation and conversion + +# Input: {"age": "25"} → converts to int(25) +# Input: {"age": "abc"} → ValidationError +``` + +## Index / Key Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **List Index Out of Range** | `list index out of range` | Accessing beyond list length | Check length first | Use `.get()` or try/except | +| **Dict KeyError** | `KeyError: 'missing_key'` | Key doesn't exist in dict | Use `.get()` with default | Pydantic models, TypedDict | +| **Array Out of Bounds** | `undefined` (JS array) | Accessing invalid index | Check array length | Use `?.[]` optional chaining | + +### Fix Template + +```python +# Bad +user_dict['email'] # KeyError if 'email' missing + +# Good +user_dict.get('email', 'no-email@example.com') + +# Best (with Pydantic) +class User(BaseModel): + email: EmailStr # Required, validated +``` + +## Import / Module Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **Module Not Found** | `ModuleNotFoundError: No module named 'x'` | Missing dependency | Install: `pip install x` | Add to requirements.txt | +| **Circular Import** | `ImportError: cannot import name 'X' from partially initialized module` | A imports B, B imports A | Refactor to remove cycle | Dependency injection | +| **Relative Import** | `attempted relative import with no known parent package` | Incorrect relative import | Use absolute imports | Configure PYTHONPATH | + +### Fix Template + +```bash +# Check installed packages +pip list | 
grep package_name + +# Install missing package +pip install package_name + +# Add to requirements +echo "package_name==1.2.3" >> requirements.txt +``` + +## Database Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **Connection Refused** | `Connection refused` | DB not running or wrong host | Check connection string | Health checks, retry logic | +| **Timeout** | `timeout exceeded` | Query too slow or DB overloaded | Optimize query, add indexes | Query analysis, connection pooling | +| **Unique Constraint** | `UNIQUE constraint failed` | Duplicate key | Handle conflict (upsert) | Pre-check existence | +| **Foreign Key Violation** | `FOREIGN KEY constraint failed` | Referenced record doesn't exist | Validate FK exists first | Use transactions | + +### Fix Template + +```python +# Handle constraint violations +from sqlalchemy.exc import IntegrityError + +try: + db.add(user) + db.commit() +except IntegrityError as e: + db.rollback() + if 'UNIQUE constraint' in str(e): + raise DuplicateUserError() + raise +``` + +## API / HTTP Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **400 Bad Request** | Malformed request | Invalid JSON or missing fields | Validate request schema | Pydantic, OpenAPI validation | +| **401 Unauthorized** | Missing/invalid auth token | Token expired or missing | Refresh token logic | Token rotation, validation | +| **404 Not Found** | Resource doesn't exist | Wrong ID or deleted resource | Return 404 with helpful message | Check existence first | +| **422 Unprocessable** | Validation failed | Data doesn't meet constraints | Fix validation or API call | Schema validation | +| **500 Internal Error** | Server-side error | Unhandled exception | Fix server code, add logging | Error handling, monitoring | + +### Fix Template + +```python +# Proper error handling +from fastapi import HTTPException + 
+@app.post("/users") +async def create_user(user: UserCreate): + existing = await db.users.find_one({"email": user.email}) + if existing: + raise HTTPException( + status_code=409, # Conflict + detail="User with this email already exists" + ) + return await db.users.insert_one(user) +``` + +## Concurrency / Race Condition Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **Race Condition** | Inconsistent results, data corruption | Multiple threads accessing shared state | Use locks, atomic operations | Immutable data, message queues | +| **Deadlock** | System hangs | Circular wait for resources | Order lock acquisition consistently | Avoid nested locks | +| **Lost Update** | Changes overwritten | Concurrent updates | Optimistic locking (version field) | Transactions, SELECT FOR UPDATE | + +### Fix Template + +```python +# Use distributed lock (Redis) +import redis_lock + +with redis_lock.Lock(redis_client, "order:123"): + order = db.orders.get(123) + order.status = "processed" + db.orders.save(order) +``` + +## Memory / Performance Errors + +| Pattern | Indicators | Root Cause | Fix | Prevention | +|---------|-----------|------------|-----|------------| +| **Out of Memory** | `MemoryError` | Loading too much data | Stream/paginate data | Lazy loading, generators | +| **N+1 Query Problem** | Slow performance | Loop with query inside | Use JOIN or eager loading | Query analysis, APM tools | +| **Memory Leak** | Memory grows over time | Objects not garbage collected | Fix circular references | Profiling, weak references | + +### Fix Template + +```python +# Bad: N+1 queries +for user in users: # 1 query + posts = db.posts.find(user_id=user.id) # N queries + +# Good: Single query with join +users_with_posts = db.execute(""" + SELECT users.*, json_agg(posts.*) as posts + FROM users + LEFT JOIN posts ON posts.user_id = users.id + GROUP BY users.id +""") +``` + +## Quick Reference: Error → Pattern 
+ +| Error Message | Pattern | Fix Priority | +|---------------|---------|--------------| +| `'NoneType' object has no attribute` | null_pointer | High | +| `unsupported operand type` | type_mismatch | Medium | +| `list index out of range` | index_error | Medium | +| `KeyError` | key_error | Medium | +| `ModuleNotFoundError` | import_error | High | +| `Connection refused` | db_connection | High | +| `UNIQUE constraint failed` | db_constraint | Medium | +| `401 Unauthorized` | api_auth | High | +| `MemoryError` | memory_error | Critical | + +--- + +**Usage**: When debugging, match error message to pattern, apply fix template, implement prevention strategy. diff --git a/skills/smart-debugging/reference/fix-generation-patterns.md b/skills/smart-debugging/reference/fix-generation-patterns.md new file mode 100644 index 0000000..2cc2188 --- /dev/null +++ b/skills/smart-debugging/reference/fix-generation-patterns.md @@ -0,0 +1,483 @@ +# Fix Generation Patterns + +Comprehensive guide to generating, evaluating, and implementing fixes for software bugs. + +## Multiple Fix Options Strategy + +**Core Principle**: Always generate 2-3 fix options with trade-off analysis. + +### Fix Option Template + +```markdown +**Option 1: [Name]** (e.g., Quick Fix) +**Implementation**: [What to change] +**Pros**: [Benefits] +**Cons**: [Drawbacks] +**Effort**: [Time estimate] +**Risk**: [Low/Medium/High] + +**Option 2: [Name]** (e.g., Proper Fix) +**Implementation**: [What to change] +**Pros**: [Benefits] +**Cons**: [Drawbacks] +**Effort**: [Time estimate] +**Risk**: [Low/Medium/High] + +**Option 3: [Name]** (e.g., Comprehensive Fix) +**Implementation**: [What to change] +**Pros**: [Benefits] +**Cons**: [Drawbacks] +**Effort**: [Time estimate] +**Risk**: [Low/Medium/High] + +**Recommendation**: Option [X] because [reasoning] +``` + +_See [null-pointer-debug-example.md](../examples/null-pointer-debug-example.md) for complete fix options example._ + +## Quick Fix vs. 
Proper Fix + +### Decision Matrix + +| Criteria | Quick Fix | Proper Fix | +|----------|-----------|------------| +| **Urgency** | Production down, immediate relief needed | Incident resolved, addressing root cause | +| **Scope** | Minimal changes, single file | Multiple files, architectural changes | +| **Time** | Minutes to hours | Hours to days | +| **Testing** | Manual verification | Full test coverage required | +| **Risk** | Low (minimal changes) | Medium (broader impact) | +| **Longevity** | Temporary patch | Permanent solution | + +### When to Use Quick Fix + +✅ **Production incident** - System is down, users impacted +✅ **Known workaround** - Clear, safe mitigation exists +✅ **Low risk** - Change is isolated and reversible +✅ **Follow-up planned** - Proper fix scheduled for next sprint + +**Pattern**: Quick fix now → Monitor → Proper fix later + +### When to Use Proper Fix + +✅ **Root cause addressed** - Not just treating symptoms +✅ **Proper testing** - Comprehensive test coverage added +✅ **Type safety** - Leverages static type checking +✅ **Prevention** - Prevents entire class of similar bugs +✅ **Documentation** - Code is self-documenting + +**Pattern**: Understand root cause → Comprehensive fix → Prevent recurrence + +## Fix Priority Assessment + +### Priority Matrix + +| Severity | Frequency | Priority | Response Time | +|----------|-----------|----------|---------------| +| **Critical** | High | P0 | Immediate (< 1 hour) | +| **Critical** | Low | P1 | Same day | +| **Major** | High | P1 | Same day | +| **Major** | Low | P2 | This week | +| **Minor** | High | P2 | This week | +| **Minor** | Low | P3 | Next sprint | + +**Severity Criteria**: +- **Critical**: Data loss, security breach, production down +- **Major**: Degraded performance, incorrect results, feature broken +- **Minor**: Edge case, cosmetic issue, rare error + +**Frequency Criteria**: +- **High**: Affects >10% of users or happens >10 times/day +- **Low**: Affects <1% of users or happens 
occasionally + +## Common Fix Patterns by Error Type + +### Null/Undefined Errors + +**Pattern 1: Null Check with Default** +```python +# Before +name = user.name # NoneType error + +# After +name = user.name if user else "Unknown" +``` + +**Pattern 2: Raise Exception** (API boundaries) +```python +# Before +user = db.users.find_one(user_id) +return user.name # NoneType error + +# After +user = db.users.find_one(user_id) +if user is None: + raise HTTPException(404, "User not found") +return user.name +``` + +### Type Errors + +**Pattern 1: Type Conversion with Validation** +```python +# Before +total = base_price + discount # TypeError: int + str + +# After +from pydantic import BaseModel + +class PriceInput(BaseModel): + base_price: int + discount: int # Automatic validation and conversion + +input_data = PriceInput(**request_body) # Validates types +total = input_data.base_price + input_data.discount +``` + +### Database Errors + +**Pattern 1: Constraint Violations** +```python +# Before +db.add(user) +db.commit() # IntegrityError: UNIQUE constraint failed + +# After +from sqlalchemy.exc import IntegrityError + +try: + db.add(user) + db.commit() +except IntegrityError: + db.rollback() + # Option A: Return error + raise HTTPException(409, "User with this email already exists") + # Option B: Upsert + existing = db.query(User).filter_by(email=user.email).first() + if existing: + existing.name = user.name + db.commit() +``` + +**Pattern 2: Connection Failures** +```python +# Before +engine = create_engine(DATABASE_URL) +connection = engine.connect() # OperationalError: connection refused + +# After +from tenacity import retry, stop_after_attempt, wait_exponential + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10) +) +def get_connection(): + engine = create_engine(DATABASE_URL) + return engine.connect() + +connection = get_connection() +``` + +### API/Integration Errors + +**Pattern 1: Validation at Boundary** +```python +# 
Before +response = payment_api.create_charge(amount=order.total) +# Fails with 422 if amount < 50 (API minimum, in cents) + +# After +class CreateChargeRequest(BaseModel): + amount: int + + @validator('amount') + def amount_meets_minimum(cls, v): + if v < 50: + raise ValueError('Amount must be at least $0.50') + return v + +# Validate before API call +request = CreateChargeRequest(amount=order.total) # Fails early +response = payment_api.create_charge(**request.dict()) +``` + +**Pattern 2: Retry with Backoff** +```python +# Before +response = httpx.get(api_url) # Times out occasionally + +# After +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential + +@retry( + retry=retry_if_exception_type(httpx.TimeoutException), + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10) +) +async def fetch_with_retry(url: str): + async with httpx.AsyncClient(timeout=5.0) as client: + return await client.get(url) +``` + +### Performance Errors + +**Pattern 1: N+1 Query Fix** +```python +# Before (N+1 queries) +users = db.query(User).all() # 1 query +for user in users: + posts = db.query(Post).filter(Post.user_id == user.id).all() # N queries + +# After (Single query with join) +users = db.query(User).options( + joinedload(User.posts) +).all() # 1 query with join +``` + +**Pattern 2: Caching** +```python +# Before +def get_user_profile(user_id: str): + return db.query(User).filter_by(id=user_id).first() # Every time + +# After +from functools import lru_cache +from cachetools import TTLCache, cached + +cache = TTLCache(maxsize=1000, ttl=300) # 5 minute TTL + +@cached(cache) +def get_user_profile(user_id: str): + return db.query(User).filter_by(id=user_id).first() +``` + +## Fix Validation Strategies + +### Validation Checklist + +```markdown +Before deploying fix: +- [ ] Fix addresses root cause (not just symptoms) +- [ ] Tests added to prevent recurrence +- [ ] Tests pass locally +- [ ] Code reviewed by peer +- [ ] No new linting/type errors +- [ ] 
Performance impact assessed +- [ ] Security implications reviewed +- [ ] Rollback plan documented +- [ ] Monitoring/alerts updated +``` + +### Test-Driven Fix Approach + +**Pattern**: Write failing test → Implement fix → Test passes + +```python +# Step 1: Write failing test +def test_get_user_with_invalid_id_returns_404(): + """Test that invalid user_id returns 404, not 500.""" + response = client.get("/users/invalid-id") + assert response.status_code == 404 + assert "User not found" in response.json()["detail"] + +# Step 2: Run test (should fail with current bug) +# pytest tests/test_users.py::test_get_user_with_invalid_id_returns_404 +# AssertionError: 500 != 404 + +# Step 3: Implement fix +@app.get("/users/{user_id}") +async def get_user(user_id: str): + user = await db.users.find_one({"id": user_id}) + if user is None: + raise HTTPException(404, "User not found") + return user + +# Step 4: Run test (should pass) +# pytest tests/test_users.py::test_get_user_with_invalid_id_returns_404 +# PASSED +``` + +### Integration Testing + +```python +# Test fix with realistic scenario +@pytest.mark.integration +async def test_order_creation_with_negative_total(): + """Integration test: Ensure negative order total is rejected.""" + # Setup + user = await create_test_user() + + # Attempt to create order with negative total + response = await client.post("/orders", json={ + "user_id": user.id, + "items": [], + "total": -100 # Invalid + }) + + # Assert validation error + assert response.status_code == 422 + assert "total must be positive" in response.json()["detail"] + + # Verify no order created in database + orders = await db.orders.find({"user_id": user.id}) + assert len(orders) == 0 +``` + +## Refactoring Considerations + +### When to Refactor During Fix + +**Refactor if**: +✅ Fix requires understanding convoluted code +✅ Code duplication prevents proper fix +✅ Poor structure makes fix risky +✅ Fix is part of larger architectural improvement + +**Don't refactor if**: +❌ 
Production incident needs immediate fix +❌ Refactoring scope unclear +❌ Tests insufficient to ensure safety +❌ Refactoring can be done separately + +### Refactoring Patterns + +**Pattern 1: Extract Function** +```python +# Before (hard to fix null error) +def process_order(order_data): + user = db.users.find_one(order_data["user_id"]) + if user.is_active and user.credits > 0: + # 50 lines of order processing + pass + +# After (easier to add null check) +def process_order(order_data): + user = get_validated_user(order_data["user_id"]) + process_order_for_user(user, order_data) + +def get_validated_user(user_id: str) -> User: + """Get user and validate they can place orders.""" + user = db.users.find_one(user_id) + if user is None: + raise HTTPException(404, "User not found") + if not user.is_active: + raise HTTPException(403, "User account inactive") + if user.credits <= 0: + raise HTTPException(402, "Insufficient credits") + return user +``` + +## Production Safety + +### Pre-Deployment Checklist + +```markdown +- [ ] Fix tested in staging environment +- [ ] Performance impact measured (CPU, memory, latency) +- [ ] Database migrations tested with production-sized data +- [ ] Feature flag available for gradual rollout +- [ ] Rollback procedure documented and tested +- [ ] Monitoring dashboard shows relevant metrics +- [ ] Alerts configured for fix-related failures +- [ ] On-call engineer briefed on deployment +- [ ] Communication sent to stakeholders +``` + +### Gradual Rollout Pattern + +```python +# Use feature flag for gradual rollout +import ldclient +from ldclient import Context +from ldclient.config import Config + +ldclient.set_config(Config("sdk-key")) +ld_client = ldclient.get() + +@app.get("/users/{user_id}") +async def get_user(user_id: str): + use_new_validation = ld_client.variation( + "new-user-validation", + Context.create(user_id), + default=False + ) + + if use_new_validation: + # New fix with validation + user = await get_validated_user(user_id) + else: + # Old code (fallback) + user = await db.users.find_one(user_id) + + return user 
+``` + +## Rollback Planning + +### Rollback Decision Criteria + +**Rollback immediately if**: +- Error rate spikes >5% above baseline +- Critical functionality broken +- Data corruption detected +- Performance degrades >50% +- Security vulnerability introduced + +**Monitor and investigate if**: +- Error rate increases <5% +- Non-critical functionality affected +- Performance degrades <20% +- Edge cases failing + +### Rollback Procedures + +**1. Application Code Rollback** +```bash +# Git-based rollback +git revert BAD_COMMIT_SHA +git push origin main + +# Or redeploy previous version +git checkout PREVIOUS_RELEASE_TAG +./deploy.sh +``` + +**2. Database Migration Rollback** +```bash +# Alembic (Python) +alembic downgrade -1 + +# Drizzle (TypeScript) +bun run drizzle-kit drop --migration +``` + +**3. Feature Flag Disable** +```python +# Instantly disable via LaunchDarkly dashboard or API +ld_client.variation("new-user-validation", context, default=False) +``` + +**4. Cache Invalidation** +```python +# Clear cache after rollback +redis_client.flushdb() # Clear all cache +# Or selectively (DEL does not expand glob patterns, so iterate matching keys) +for key in redis_client.scan_iter(match="user:*"): + redis_client.delete(key) # Clear user cache only +``` + +## Quick Reference + +| Error Type | Primary Fix Pattern | Testing Strategy | +|------------|-------------------|------------------| +| **Null/Undefined** | Null check, optional chaining, raise exception | Unit test with None input | +| **Type Mismatch** | Pydantic validation, type guards | Unit test with wrong types | +| **Database** | Try/except with rollback, retries | Integration test with DB | +| **API/Integration** | Validation at boundary, retries | Mock API responses | +| **Performance** | Caching, query optimization | Performance benchmark test | + +| Fix Type | When to Use | Risk Level | +|----------|-------------|------------| +| **Quick Fix** | Production incident | Low (isolated change) | +| **Proper Fix** | Root cause resolution | Medium (broader changes) | +| **Comprehensive Fix** | Prevention of entire class | Medium-High (architectural) | + 
+--- + +**Usage**: When implementing fix, generate 2-3 options with trade-offs, select best option based on priority, validate with tests, deploy with gradual rollout, monitor closely, document rollback procedure. diff --git a/skills/smart-debugging/reference/rca-methodology.md b/skills/smart-debugging/reference/rca-methodology.md new file mode 100644 index 0000000..3acdbbc --- /dev/null +++ b/skills/smart-debugging/reference/rca-methodology.md @@ -0,0 +1,466 @@ +# Root Cause Analysis (RCA) Methodology + +Comprehensive guide to performing effective root cause analysis for software bugs and incidents. + +## What is Root Cause Analysis? + +**Definition**: Systematic process of identifying the fundamental reason why a problem occurred, not just treating symptoms. + +**Goal**: Find the root cause(s) to implement prevention strategies that stop recurrence. + +**Key Principle**: Distinguish between: +- **Symptom**: Observable error (e.g., "API returns 500 error") +- **Proximate Cause**: Immediate trigger (e.g., "Database query timeout") +- **Root Cause**: Fundamental reason (e.g., "Missing database index on frequently queried column") + +## The 5 Whys Technique + +### Overview + +**Method**: Ask "Why?" five times (or more) to drill down from symptom to root cause. + +**Origin**: Toyota Production System (Lean Manufacturing) + +**Best For**: Sequential cause-effect chains + +### Example: Null Pointer Error + +``` +Problem: API endpoint returns 500 error + +Why? → User object is null when accessing .name property +Why? → Database query returned null instead of user +Why? → User ID doesn't exist in database +Why? → Frontend sent incorrect user ID from stale cache +Why? → Cache invalidation not triggered after user deletion +ROOT CAUSE: Missing cache invalidation on user deletion +``` + +### Rules for Effective 5 Whys + +1. **Be Specific**: "Database slow" → "Query takes 4.5s (target: <200ms)" +2. **Use Data**: Support each "Why" with evidence (logs, metrics, traces) +3. 
**Don't Stop Too Early**: Keep asking until you reach a process/policy root cause +4. **Don't Blame People**: Focus on processes, not individuals +5. **May Need More/Fewer Than 5**: Stop when you reach actionable root cause + +### Template + +```markdown +**Problem Statement**: [Observable symptom with impact] + +**Why #1**: [First level cause] +**Evidence**: [Logs, metrics, traces] + +**Why #2**: [Deeper cause] +**Evidence**: [Supporting data] + +**Why #3**: [Even deeper] +**Evidence**: [Supporting data] + +**Why #4**: [Near root cause] +**Evidence**: [Supporting data] + +**Why #5**: [Root cause] +**Evidence**: [Supporting data] + +**Root Cause**: [Fundamental reason] +**Prevention**: [How to prevent recurrence] +``` + +## Fishbone (Ishikawa) Diagram + +### Overview + +**Method**: Visual diagram categorizing potential causes into major categories. + +**Best For**: Complex problems with multiple contributing factors + +**Categories (Software Context)**: +- **Code**: Logic errors, missing validation, edge cases +- **Data**: Invalid inputs, corrupt data, missing records +- **Infrastructure**: Server issues, network problems, resource limits +- **Dependencies**: Third-party APIs, libraries, services +- **Process**: Deployment issues, configuration errors, environment mismatches +- **People**: Knowledge gaps, communication failures, assumptions + +### Example Structure + +``` + Code + | + Missing validation + / + / +API 500 ─────────────── Infrastructure +Error \ + \ + Database timeout + | + Data +``` + +### When to Use + +- Multiple potential root causes +- Need stakeholder alignment on cause +- Complex systems with many dependencies +- Post-incident reviews with team + +## Timeline Analysis + +### Overview + +**Method**: Create chronological sequence of events leading to incident. 
+ +**Best For**: Understanding cascading failures, race conditions, timing issues + +### Example + +```markdown +## Timeline: User Profile Page Crash + +**T-00:05** - User updates profile information +**T-00:03** - Profile update succeeds, cache invalidation triggered +**T-00:02** - Cache clear initiated but takes 3s (network latency) +**T-00:00** - User refreshes page, cache still has old data +**T+00:01** - API fetches user from cache (stale) +**T+00:02** - Frontend renders with field that was deleted in update +**T+00:03** - JavaScript error: Cannot read property 'X' of undefined +**T+00:04** - Error boundary catches error, shows crash page + +**Root Cause**: Cache invalidation is async and completes after page reload, causing stale data rendering. +``` + +### Components + +| Element | Description | Example | +|---------|-------------|---------| +| **Timestamp** | Relative or absolute time | `T-00:05` or `14:32:15 UTC` | +| **Event** | What happened | `User clicked submit` | +| **System State** | Relevant state at time | `Cache: stale, DB: updated` | +| **Decision Point** | Branch in event chain | `If cache miss: fetch DB` | + +## Distinguishing Root Cause from Symptoms + +### Symptom vs. Root Cause + +| Symptom | Root Cause | +|---------|------------| +| API returns 500 error | Missing error handling for null user | +| Database query slow | Missing index on `user_id` column | +| Memory leak in production | Circular reference in event listeners | +| User can't login | Session cookie expires after 5 minutes (should be 24h) | + +### Test: The Prevention Question + +**Ask**: "If I fix this, will the problem never happen again?" + +**If Yes** → Likely root cause +**If No** → Still a symptom or contributing factor + +**Example**: +- "Add null check" → Prevents this specific null error, but why was data null? (Symptom fix) +- "Add database foreign key constraint" → Prevents any invalid user_id from being stored (Root cause fix) + +## Contributing Factors vs. 
Root Cause + +### Multiple Contributing Factors + +Complex incidents often have multiple contributing factors and one primary root cause. + +**Example: Data Loss Incident** + +```markdown +**Primary Root Cause**: Database backup script fails silently (no monitoring) + +**Contributing Factors**: +1. No backup validation process +2. Backup monitoring disabled in production +3. Backup script lacks error logging +4. No runbook for backup verification +5. Manual backup never tested + +**Analysis**: All factors contributed, but root cause is silent failure. Fix that first. +``` + +### Prioritization + +| Priority | Factor Type | Action | +|----------|-------------|--------| +| **P0** | Root cause | Fix immediately | +| **P1** | Major contributor | Fix in same release | +| **P2** | Minor contributor | Fix in next sprint | +| **P3** | Edge case | Backlog | + +## RCA Documentation Format + +### Standard Structure + +```markdown +# Root Cause Analysis: [Title] + +**Date**: YYYY-MM-DD +**Incident ID**: INC-12345 +**Severity**: [SEV1/SEV2/SEV3] +**Participants**: [Names of people involved in RCA] + +## Summary + +[2-3 sentence overview of incident and root cause] + +## Impact + +- **Users Affected**: [Number or %] +- **Duration**: [Time from start to resolution] +- **Business Impact**: [Revenue loss, SLA breach, etc.] + +## Timeline + +[Chronological sequence of events] + +## Root Cause + +[Detailed explanation of fundamental cause] + +### 5 Whys Analysis + +[Step-by-step "Why?" 
chain] + +## Contributing Factors + +[List of factors that enabled or worsened the incident] + +## Prevention + +### Immediate Actions (Within 24h) +- [ ] Action 1 +- [ ] Action 2 + +### Short-term Actions (Within 1 week) +- [ ] Action 1 +- [ ] Action 2 + +### Long-term Actions (Within 1 month) +- [ ] Action 1 +- [ ] Action 2 + +## Lessons Learned + +[Key takeaways and process improvements] +``` + +## Prevention Strategy Development + +### Fix Categories + +| Category | Description | Example | +|----------|-------------|---------| +| **Technical** | Code, config, infrastructure changes | Add database index, implement retry logic | +| **Process** | Changes to how work is done | Require code review for DB changes | +| **Monitoring** | Detect issues before they cause incidents | Alert on slow query thresholds | +| **Testing** | Catch issues before production | Add integration test for edge case | +| **Documentation** | Improve knowledge sharing | Document backup restoration procedure | + +### Prevention Checklist + +```markdown +**Can we prevent the root cause?** +- [ ] Technical fix implemented +- [ ] Tests added to catch recurrence +- [ ] Monitoring added to detect early + +**Can we detect it faster?** +- [ ] Alerts configured +- [ ] Logging improved +- [ ] Dashboards updated + +**Can we mitigate impact?** +- [ ] Graceful degradation added +- [ ] Circuit breaker implemented +- [ ] Fallback logic added + +**Can we recover faster?** +- [ ] Runbook created +- [ ] Automation added +- [ ] Team trained + +**Can we prevent similar issues?** +- [ ] Pattern identified +- [ ] Linting rule added +- [ ] Architecture review scheduled +``` + +## Common RCA Pitfalls + +### Pitfall 1: Stopping Too Early + +**Bad**: +``` +Why? → User got 500 error +Root Cause: Server returned error + +Prevention: Fix server error +``` + +**Good**: +``` +Why? → User got 500 error +Why? → Server threw unhandled exception +Why? → Null pointer accessing user.email +Why? → User object was null +Why? 
→ Database returned no user +Why? → User ID didn't exist +Why? → Frontend sent deleted user's ID +Why? → Frontend cache not invalidated after deletion +Root Cause: Missing cache invalidation on user deletion + +Prevention: Trigger cache clear on user deletion, add cache TTL as safety +``` + +### Pitfall 2: Blaming People + +**Bad**: +``` +Root Cause: Developer forgot to add validation +Prevention: Tell developer to remember next time +``` + +**Good**: +``` +Root Cause: No validation enforced at API boundary +Prevention: +- Use Pydantic for automatic validation +- Add linting rule to detect missing validation +- Update code review checklist +``` + +### Pitfall 3: Accepting "Human Error" as Root Cause + +**Bad**: +``` +Root Cause: Admin accidentally deleted production database +Prevention: Be more careful +``` + +**Good**: +``` +Root Cause: Production database lacks deletion protection +Prevention: +- Enable RDS deletion protection +- Require MFA for production access +- Implement soft-delete instead of hard-delete +- Add "Are you sure?" confirmation with typed confirmation +``` + +### Pitfall 4: Multiple Root Causes Without Prioritization + +**Bad**: +``` +Root Causes: +1. Missing error handling +2. No monitoring +3. Bad documentation +4. Insufficient testing +5. Poor communication + +Prevention: Fix all of them +``` + +**Good**: +``` +Primary Root Cause: Missing error handling (caused immediate incident) + +Contributing Factors: +- No monitoring (delayed detection) +- Insufficient testing (didn't catch before deployment) + +Prevention Priority: +1. Add error handling (prevents recurrence) - P0 +2. Add monitoring (faster detection) - P1 +3. Add tests (catch in CI) - P1 +4. 
Improve docs (better response) - P2 +``` + +### Pitfall 5: Technical Fix Without Process Improvement + +**Bad**: +``` +Root Cause: Missing database index +Prevention: Add index +``` + +**Good**: +``` +Root Cause: Missing database index causing slow queries +Prevention: +- Technical: Add index on user_id column +- Process: Require query performance review in code review +- Monitoring: Alert on queries >200ms +- Testing: Add performance test asserting query count +``` + +## RCA Review and Validation + +### Review Checklist + +```markdown +- [ ] Root cause clearly identified and evidence-based +- [ ] Timeline accurate and complete +- [ ] All contributing factors documented +- [ ] Prevention strategies are actionable +- [ ] Prevention strategies assigned owners and due dates +- [ ] Lessons learned documented +- [ ] Incident review meeting scheduled +- [ ] RCA shared with relevant teams +``` + +### Validation Questions + +1. **Completeness**: Does the RCA explain all observed symptoms? +2. **Preventability**: Will the proposed fixes prevent recurrence? +3. **Testability**: Can we verify the fixes work? +4. **Generalizability**: Are there similar issues we should address? +5. **Sustainability**: Will fixes remain effective long-term? 
+ +## Best Practices + +### Do's + +✅ **Start RCA immediately** after incident resolution +✅ **Involve multiple people** for diverse perspectives +✅ **Use data** to support each "Why" answer +✅ **Focus on processes**, not people +✅ **Document everything** even if it seems obvious +✅ **Assign owners** to all prevention actions +✅ **Set deadlines** for prevention implementation +✅ **Follow up** to ensure actions completed + +### Don'ts + +❌ **Don't rush** - Thorough RCA takes time +❌ **Don't blame** - Focus on systemic issues +❌ **Don't accept vague answers** - "System was slow" → "Query took 4.5s" +❌ **Don't stop at technical fixes** - Address process and monitoring too +❌ **Don't skip documentation** - Future incidents benefit from past RCAs +❌ **Don't forget to close the loop** - Verify prevention actions worked + +## Quick Reference + +| Technique | Best For | Output | +|-----------|----------|--------| +| **5 Whys** | Sequential cause-effect chains | Linear cause chain → root cause | +| **Fishbone** | Multiple potential causes | Categorized causes diagram | +| **Timeline** | Cascading failures, timing issues | Chronological event sequence | + +| Root Cause Type | Fix Strategy | +|-----------------|--------------| +| **Missing validation** | Add validation at boundary + tests | +| **Missing error handling** | Add try/catch + logging + monitoring | +| **Performance issue** | Optimize + add performance test + alert | +| **Configuration error** | Fix config + add validation + documentation | +| **Process gap** | Update process + add checklist + training | + +--- + +**Usage**: When debugging is complete, perform RCA to understand why the bug existed and how to prevent similar issues. Use 5 Whys for most cases, Fishbone for complex multi-factor incidents, Timeline for cascading failures. 
diff --git a/skills/smart-debugging/reference/stack-trace-patterns.md b/skills/smart-debugging/reference/stack-trace-patterns.md new file mode 100644 index 0000000..e09a0b5 --- /dev/null +++ b/skills/smart-debugging/reference/stack-trace-patterns.md @@ -0,0 +1,414 @@ +# Stack Trace Patterns + +Comprehensive guide to reading, analyzing, and extracting insights from stack traces across different languages and environments. + +## Python Stack Traces + +### Anatomy + +```python +Traceback (most recent call last): + File "/app/api/users.py", line 45, in get_user + user = db.query(User).filter(User.id == user_id).one() + File "/venv/lib/sqlalchemy/orm/query.py", line 2890, in one + raise NoResultFound("No row was found") +sqlalchemy.orm.exc.NoResultFound: No row was found +``` + +**Reading Order**: Bottom-up (exception → root cause) + +| Component | Description | Example | +|-----------|-------------|---------| +| **Exception Type** | The error class | `sqlalchemy.orm.exc.NoResultFound` | +| **Exception Message** | Error description | `"No row was found"` | +| **Root Frame** | Where error originated | `query.py:2890, in one` | +| **Call Stack** | Function call chain | `users.py:45 → query.py:2890` | + +### Identifying the Root Cause + +```python +# Stack trace from user code → library code +Traceback (most recent call last): + File "/app/api/orders.py", line 23, in create_order # ← Your code (start here!) + payment = process_payment(order.total) + File "/app/services/payment.py", line 67, in process_payment + stripe.charge.create(amount=amount) + File "/venv/lib/stripe/api.py", line 342, in create # ← Library code (ignore) + raise InvalidRequestError("Amount must be positive") +stripe.error.InvalidRequestError: Amount must be positive +``` + +**Analysis**: +1. Exception: `InvalidRequestError` - Amount validation failed +2. Root frame in your code: `payment.py:67` - Calling Stripe with invalid amount +3. 
Source of bad data: `orders.py:23` - Passing `order.total` (likely 0 or negative) + +**Fix Location**: Check `order.total` validation in `orders.py:23` + +### Filtering Noise + +**Focus on**: +- Files in your project directory (`/app/*`) +- First occurrence of error in your code + +**Ignore**: +- Virtual environment files (`/venv/*`, `site-packages/*`) +- Standard library (`/usr/lib/python3.*/`) +- Framework internals (unless debugging framework) + +### Async Stack Traces + +```python +Traceback (most recent call last): + File "/app/api/users.py", line 23, in get_user_profile + user = await fetch_user(user_id) + File "/app/services/users.py", line 45, in fetch_user + data = await http_client.get(f"/users/{user_id}") + File "/venv/lib/httpx/_client.py", line 1234, in get + raise ConnectTimeout() +httpx.ConnectTimeout: Connection timed out +``` + +**Key Indicators**: +- `await` in frame descriptions +- Async function names (`async def`) +- Coroutine references + +**Analysis**: Trace async call chain: `get_user_profile` → `fetch_user` → `httpx.get` → timeout + +## JavaScript/TypeScript Stack Traces + +### Node.js Format + +```javascript +Error: User not found + at UserService.findById (/app/services/user.service.ts:42:11) + at async getUserProfile (/app/api/users.controller.ts:23:18) + at async /app/middleware/auth.ts:67:5 + at async handleRequest (/app/server/request-handler.ts:15:3) +``` + +**Reading Order**: Top-down (error → call chain) + +| Component | Description | Example | +|-----------|-------------|---------| +| **Error Type** | Error class | `Error` | +| **Error Message** | Description | `"User not found"` | +| **Root Frame** | Where thrown | `user.service.ts:42:11` | +| **Call Stack** | Caller chain | `getUserProfile` → middleware → request handler | + +### Browser Stack Traces + +```javascript +Uncaught TypeError: Cannot read property 'name' of undefined + at UserProfile.render (UserProfile.tsx:15:32) + at finishClassComponent 
(react-dom.production.min.js:123:45) + at updateClassComponent (react-dom.production.min.js:456:12) +``` + +**Analysis**: +- Error: Accessing `.name` on undefined object +- Root: `UserProfile.tsx:15:32` (your component) +- Framework: React rendering internals (ignore) + +**Fix**: Add null check in `UserProfile.tsx:15` + +### Minified Stack Traces + +```javascript +TypeError: Cannot read property 'name' of undefined + at t.render (main.a3b4c5d6.js:1:23456) + at u (2.chunk.js:4:567) +``` + +**Problem**: Minified code is unreadable (`t`, `u`, cryptic filenames) + +**Solution**: Use source maps + +```javascript +// With source map +TypeError: Cannot read property 'name' of undefined + at UserProfile.render (src/components/UserProfile.tsx:15:32) + at ReactComponent.update (src/lib/react.ts:45:12) +``` + +**How**: Ensure source maps are available: +- Development: Always enabled +- Production: Enable for debugging (`.map` files) +- Error tracking: Sentry, Bugsnag auto-apply source maps + +## Java Stack Traces + +### Format + +```java +java.lang.NullPointerException: Cannot invoke "User.getName()" because "user" is null + at com.example.UserService.getFullName(UserService.java:42) + at com.example.UserController.getUserProfile(UserController.java:23) + at org.springframework.web.method.support.InvocableHandlerMethod.invoke(InvocableHandlerMethod.java:219) + at org.springframework.web.servlet.mvc.method.annotation.ServletInvocableHandlerMethod.invokeAndHandle(ServletInvocableHandlerMethod.java:142) +``` + +**Reading Order**: Top-down + +**Analysis**: +- Exception: `NullPointerException` with helpful message (Java 14+) +- Root: `UserService.java:42` calling `.getName()` on null +- Caller: `UserController.java:23` +- Framework: Spring MVC (ignore) + +**Fix**: Add null check at `UserService.java:42` + +## FastAPI/Pydantic Stack Traces + +### Validation Error + +```python +pydantic.error_wrappers.ValidationError: 2 validation errors for UserCreate +email + field required 
(type=value_error.missing) +age + ensure this value is greater than 0 (type=value_error.number.not_gt; limit_value=0) +``` + +**Analysis**: +- Not a traditional stack trace - validation error report +- Lists all validation failures +- Each error shows: field, message, type + +**Fix**: Client must send valid `email` (required) and `age > 0` + +### FastAPI Exception + +```python +Traceback (most recent call last): + File "/app/api/endpoints/users.py", line 45, in create_user + db_user = await crud.user.create(user_in) + File "/app/crud/user.py", line 23, in create + db.add(db_obj) + File "/venv/lib/sqlalchemy/orm/session.py", line 2345, in add + raise IntegrityError("UNIQUE constraint failed: users.email") +sqlalchemy.exc.IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: users.email +``` + +**Analysis**: +- Database constraint violation +- Root in your code: `crud/user.py:23` trying to insert duplicate email +- Caller: `api/endpoints/users.py:45` + +**Fix**: Check if user exists before insert, or use upsert + +## Cloudflare Workers Stack Traces + +### Format + +```javascript +Error: Failed to fetch user data + at fetchUser (worker.js:45:11) + at handleRequest (worker.js:23:18) +``` + +**Characteristics**: +- Minimal stack (no Node.js internals) +- V8 isolate execution context +- Limited to worker code only + +**Edge Cases**: +```javascript +Uncaught (in promise) TypeError: response.json is not a function +``` +- Common: Missing `await` on the `fetch()` call, so `response` is still a pending Promise rather than a Response +- Fix: `const response = await fetch(url)` before calling `await response.json()` + +## Pattern Recognition + +### Null/Undefined Access Patterns + +**Python**: +``` +AttributeError: 'NoneType' object has no attribute 'X' +TypeError: 'NoneType' object is not subscriptable +``` + +**JavaScript**: +``` +TypeError: Cannot read property 'X' of null +TypeError: Cannot read property 'X' of undefined +``` + +**Java**: +``` +java.lang.NullPointerException: Cannot invoke "X" because "Y" is null +``` + +### Type Mismatch Patterns + 
+**Python**: +``` +TypeError: unsupported operand type(s) for +: 'int' and 'str' +TypeError: 'X' object is not callable +``` + +**JavaScript**: +``` +TypeError: X is not a function +TypeError: Cannot convert undefined or null to object +``` + +### Import/Module Patterns + +**Python**: +``` +ModuleNotFoundError: No module named 'X' +ImportError: cannot import name 'X' from 'Y' +``` + +**JavaScript**: +``` +Error: Cannot find module 'X' +SyntaxError: Unexpected token 'export' +``` + +### Database Patterns + +**SQLAlchemy**: +``` +sqlalchemy.orm.exc.NoResultFound +sqlalchemy.exc.IntegrityError: UNIQUE constraint failed +sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) connection refused +``` + +**Drizzle ORM**: +``` +DrizzleError: Unique constraint failed on column: email +DrizzleError: Connection to database server failed +``` + +## Analysis Workflow + +### 1. Identify Exception Type + +```python +# Example +sqlalchemy.exc.IntegrityError: UNIQUE constraint failed: users.email +# ↓ +# Type: IntegrityError (database constraint) +# Subtype: UNIQUE (duplicate key) +``` + +### 2. Locate Root Frame in Your Code + +```python +Traceback (most recent call last): + File "/app/api/users.py", line 45, in create_user # ← Root frame + db.add(user) + File "/venv/lib/sqlalchemy/orm/session.py", line 2345, in add # ← Library + raise IntegrityError() +``` + +**Rule**: First frame in your project directory before library/framework code + +### 3. Trace Backwards Through Call Chain + +```python +create_user (users.py:45) + ↓ calls +UserService.create (user_service.py:23) + ↓ calls +db.add (sqlalchemy) → IntegrityError +``` + +**Analysis**: Error originates in `db.add`, propagates through `UserService.create`, surfaces in `create_user` endpoint + +### 4. Identify Data Flow + +```python +# users.py:45 +user = User(email=request.email) # ← Where does request.email come from? 
+db.add(user) # ← Fails with UNIQUE constraint + +# Trace back: +# request.email ← Request body +# ← Client sent duplicate email +# ← Need validation before DB insert +``` + +### 5. Formulate Hypothesis + +**Pattern**: UNIQUE constraint → Attempting duplicate insert +**Root Cause**: No existence check before insert +**Fix**: Add existence check or use upsert + +## Advanced Patterns + +### Recursive Stack Traces + +```python +RecursionError: maximum recursion depth exceeded + File "/app/services/tree.py", line 23, in calculate_depth + return 1 + calculate_depth(node.parent) + File "/app/services/tree.py", line 23, in calculate_depth + return 1 + calculate_depth(node.parent) + [Previous line repeated 996 more times] +``` + +**Analysis**: Circular reference in `node.parent` chain + +**Fix**: Add base case or cycle detection + +### Chained Exceptions (Python 3) + +```python +Traceback (most recent call last): + File "/app/db/connection.py", line 15, in connect + engine.connect() + sqlalchemy.exc.OperationalError: connection refused + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/app/api/users.py", line 45, in get_user + db.connect() + File "/app/db/connection.py", line 20, in connect + raise DatabaseConnectionError() from e +app.exceptions.DatabaseConnectionError: Failed to connect to database +``` + +**Reading**: Two stack traces: +1. Original: `OperationalError` (connection refused) +2. Wrapped: `DatabaseConnectionError` (user-friendly message) + +**Root Cause**: Original exception (connection refused) + +### Multiple Exception Points + +```javascript +UnhandledPromiseRejectionWarning: Error: API request failed + at fetchData (api.js:23:11) + (node:12345) UnhandledPromiseRejectionWarning: Unhandled promise rejection. 
+``` + +**Analysis**: Promise rejected but no `.catch()` handler + +**Fix**: Add `.catch()` or use `try/await/catch` + +## Quick Reference + +| Language | Read Direction | Focus On | Ignore | +|----------|---------------|----------|--------| +| **Python** | Bottom-up | Last frame in your code | `/venv/`, stdlib | +| **JavaScript** | Top-down | First frame in your code | `node_modules/` | +| **Java** | Top-down | `com.example.*` | `org.springframework.*` | +| **TypeScript** | Top-down | `src/`, `.ts` files | `node_modules/`, `.min.js` | + +| Error Pattern | Stack Trace Indicator | Fix Priority | +|---------------|----------------------|--------------| +| **Null/undefined** | `NoneType`, `null`, `undefined` | High | +| **Type mismatch** | `unsupported operand`, `is not a function` | Medium | +| **Import error** | `ModuleNotFoundError`, `Cannot find module` | High | +| **Database** | `IntegrityError`, `OperationalError` | High | +| **Async** | `await`, `Promise`, `Coroutine` | Medium | + +--- + +**Usage**: When debugging, identify exception type, locate root frame in your code, trace backwards through call chain, identify data flow, formulate hypothesis. diff --git a/skills/smart-debugging/templates/rca-template.md b/skills/smart-debugging/templates/rca-template.md new file mode 100644 index 0000000..7b027b4 --- /dev/null +++ b/skills/smart-debugging/templates/rca-template.md @@ -0,0 +1,388 @@ +# Root Cause Analysis: [Title] + +**Instructions**: Copy this template and fill in all sections. Delete instructional text in [brackets] when complete. 
+ +--- + +## Metadata + +**Date**: [YYYY-MM-DD when RCA was conducted] +**Incident Date**: [YYYY-MM-DD when incident occurred] +**Incident ID**: [INC-12345 or ticket number] +**Severity**: [SEV1 (Critical) / SEV2 (Major) / SEV3 (Minor)] +**Status**: [Draft / In Review / Completed] +**Owner**: [Name of person conducting RCA] +**Reviewers**: [Names of people who reviewed this RCA] +**Related Issues**: [Links to tickets, PRs, or incidents] + +--- + +## Executive Summary + +[2-3 sentence summary covering: What broke? What was the impact? What was the root cause?] + +**Example**: +> On 2025-11-05 at 14:32 UTC, the user profile API began returning 500 errors for approximately 15% of requests. Users were unable to view or update their profiles for 2.5 hours. Root cause was a missing null check after a recent code change that allowed deleted users to remain in cache, causing null pointer errors when their profiles were accessed. + +--- + +## Impact + +### User Impact + +**Users Affected**: [Number or percentage] +- [Describe which users were impacted and how] + +**Duration**: [Time from detection to resolution] +- **Detected**: [YYYY-MM-DD HH:MM UTC] +- **Mitigated**: [YYYY-MM-DD HH:MM UTC] +- **Resolved**: [YYYY-MM-DD HH:MM UTC] +- **Total Duration**: [X hours Y minutes] + +**User Experience**: [Describe what users saw/experienced] +- [e.g., "Error page when accessing profile"] +- [e.g., "Unable to complete checkout"] + +### Business Impact + +**Revenue Impact**: [Estimated or actual] +- [e.g., "$12,000 in lost transactions"] +- [e.g., "N/A - no direct revenue impact"] + +**SLA Impact**: [If applicable] +- [e.g., "99.9% uptime SLA breached (99.4% actual)"] +- [e.g., "Within SLA bounds"] + +**Reputation Impact**: [Qualitative assessment] +- [e.g., "12 support tickets, 5 negative social media mentions"] +- [e.g., "Minimal - caught before user reports"] + +--- + +## Timeline + +[Chronological sequence of events. 
Use timestamps relative to the first error (T±HH:MM, e.g. T-00:10 or T+00:45) or absolute times (HH:MM UTC)] + +| Time | Event | System State | Action Taken | +|------|-------|--------------|--------------| +| [T-00:10] | [What happened] | [Relevant system state] | [What was done] | +| [T-00:05] | [What happened] | [Relevant system state] | [What was done] | +| [T+00:00] | [Incident start - first error] | [System state at incident] | [Initial response] | +| [T+00:15] | [Detection/alerting] | [How issue was discovered] | [Investigation started] | +| [T+00:45] | [Mitigation deployed] | [Temporary fix applied] | [What was mitigated] | +| [T+02:30] | [Resolution deployed] | [Permanent fix deployed] | [What was fixed] | + +**Example**: +| Time | Event | System State | Action Taken | +|------|-------|--------------|--------------| +| T-00:15 | User deletion feature deployed | Cache invalidation logic updated | Normal deployment | +| T-00:10 | Admin deleted test user account | User removed from DB, cache clear initiated | User deleted successfully | +| T-00:05 | Cache clear completed (async) | User data removed from cache | Background job finished | +| T+00:00 | Profile API starts returning 500 errors | Null pointer accessing deleted user's data | Errors logged but not alerted | +| T+00:15 | Error spike detected by monitoring | 15% error rate on /users/:id endpoint | On-call paged | +| T+00:30 | Investigation identified null pointer | Deleted users in cache causing errors | Root cause hypothesis formed | +| T+00:45 | Quick fix deployed (null check) | 500 errors stopped | Mitigation successful | +| T+02:30 | Proper fix deployed (validation) | Comprehensive null handling added | Permanent resolution | + +--- + +## Root Cause + +### Problem Statement + +[Clear, specific description of what went wrong] + +**Example**: +> User profile API returned 500 errors when accessing profiles of recently deleted users because the cache invalidation was asynchronous and completed after the user record was deleted from the database. 
When the API tried to access the cached user object, it was null, causing an unhandled null pointer exception. + +### 5 Whys Analysis + +**Why #1**: [First level cause] +**Evidence**: [Logs, metrics, stack traces, or other data supporting this] + +**Why #2**: [Deeper cause - why did #1 happen?] +**Evidence**: [Supporting data] + +**Why #3**: [Even deeper - why did #2 happen?] +**Evidence**: [Supporting data] + +**Why #4**: [Near root cause - why did #3 happen?] +**Evidence**: [Supporting data] + +**Why #5**: [Root cause - fundamental reason] +**Evidence**: [Supporting data] + +**Example**: + +**Why #1**: API returned 500 error (unhandled `AttributeError` on a null user object) +**Evidence**: Stack trace shows `'NoneType' object has no attribute 'name'` at `users.py:45` + +**Why #2**: User object was null when accessing `.name` property +**Evidence**: Database query returned None for user_id `usr_12345` + +**Why #3**: User ID didn't exist in database +**Evidence**: User was deleted 10 minutes before the first error occurred (T-00:10 in the timeline) + +**Why #4**: Deleted user's profile was still being accessed +**Evidence**: Cache contained user_id `usr_12345` in "recently viewed" list + +**Why #5**: Cache invalidation completed after user record deletion +**Evidence**: Cache clear is asynchronous and took 3 seconds; user accessed profile during this window + +**Root Cause**: Asynchronous cache invalidation creates a timing window where deleted users can be accessed before cache clears, causing null pointer errors. + +### Root Cause Category + +[Select one or more] +- [ ] Code Logic Error +- [ ] Missing Validation +- [ ] Missing Error Handling +- [ ] Configuration Error +- [ ] Infrastructure Issue +- [ ] Dependency Failure +- [ ] Performance/Scale Issue +- [ ] Race Condition/Timing Issue +- [ ] Security Vulnerability +- [ ] Data Quality Issue +- [ ] Process/Communication Gap + +--- + +## Contributing Factors + +[List factors that enabled or worsened the incident, but aren't the primary root cause] + +1. 
**[Factor Name]**: [Description] + - **Impact**: [How it contributed] + - **Evidence**: [Supporting data] + +2. **[Factor Name]**: [Description] + - **Impact**: [How it contributed] + - **Evidence**: [Supporting data] + +**Example**: + +1. **No Monitoring on User Deletion**: User deletion events had no metrics or alerts + - **Impact**: Delayed detection by 15 minutes (relied on error spike detection) + - **Evidence**: No alert fired until error rate threshold exceeded + +2. **Insufficient Error Handling**: API endpoint lacked null checks for user objects + - **Impact**: Null pointer became 500 error instead of graceful 404 + - **Evidence**: No null validation in `get_user_profile` function + +3. **Missing Integration Tests**: No test covered "access recently deleted user" scenario + - **Impact**: Issue not caught before production deployment + - **Evidence**: Test suite has 0 tests for deleted user scenarios + +--- + +## Prevention + +### Immediate Actions (Within 24 hours) + +**Target Completion**: [Date] + +- [ ] **[Action 1]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +- [ ] **[Action 2]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +**Example**: +- [x] **Add null checks**: Add defensive null checks in all user profile endpoints + - **Owner**: Alice Smith + - **Status**: Completed + - **Completion Date**: 2025-11-06 + - **PR**: #1234 + +### Short-term Actions (Within 1 week) + +**Target Completion**: [Date] + +- [ ] **[Action 1]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +- [ ] **[Action 2]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +**Example**: +- [ ] **Make cache invalidation synchronous**: Update user 
deletion to wait for cache clear + - **Owner**: Bob Johnson + - **Status**: In Progress + - **Target Date**: 2025-11-12 + - **Issue**: PROJ-567 + +- [ ] **Add monitoring for user deletions**: Track user deletion events and cache invalidation duration + - **Owner**: Carol Williams + - **Status**: Not Started + - **Target Date**: 2025-11-13 + - **Issue**: PROJ-568 + +### Long-term Actions (Within 1 month) + +**Target Completion**: [Date] + +- [ ] **[Action 1]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +- [ ] **[Action 2]**: [Description] + - **Owner**: [Name] + - **Status**: [Not Started / In Progress / Completed] + - **Completion Date**: [YYYY-MM-DD] + +**Example**: +- [ ] **Implement soft-delete pattern**: Replace hard deletes with soft deletes (deleted_at timestamp) + - **Owner**: Alice Smith + - **Status**: Not Started + - **Target Date**: 2025-12-06 + - **Issue**: PROJ-569 + +- [ ] **Add comprehensive deleted user test suite**: Cover all scenarios involving deleted users + - **Owner**: David Lee + - **Status**: Not Started + - **Target Date**: 2025-12-13 + - **Issue**: PROJ-570 + +--- + +## Technical Details + +### Code Changes + +**Files Modified**: +- [File path 1]: [Description of changes] +- [File path 2]: [Description of changes] + +**Key Code Snippets**: + +```python +# Before (buggy code) +[paste relevant buggy code] +``` + +```python +# After (fixed code) +[paste fixed code] +``` + +### Monitoring and Metrics + +**Key Metrics During Incident**: +- [Metric name]: [Value during incident] (baseline: [normal value]) +- [Metric name]: [Value during incident] (baseline: [normal value]) + +**Example**: +- API Error Rate: 15% (baseline: 0.1%) +- Response Time (p99): 5.2s (baseline: 200ms) +- Cache Hit Rate: 65% (baseline: 95%) + +**New Monitoring Added**: +- [New alert/dashboard/metric 1] +- [New alert/dashboard/metric 2] + +--- + +## Lessons Learned + +### What Went Well 
+ +1. [Positive aspect 1 - e.g., "Quick detection via monitoring"] +2. [Positive aspect 2 - e.g., "Clear communication during incident"] +3. [Positive aspect 3 - e.g., "Effective rollback procedure"] + +### What Could Be Improved + +1. [Improvement area 1 - e.g., "Better integration test coverage"] +2. [Improvement area 2 - e.g., "Faster root cause identification"] +3. [Improvement area 3 - e.g., "More comprehensive error handling"] + +### Action Items for Process Improvement + +- [ ] **[Process improvement 1]**: [Description] + - **Owner**: [Name] + - **Target Date**: [YYYY-MM-DD] + +- [ ] **[Process improvement 2]**: [Description] + - **Owner**: [Name] + - **Target Date**: [YYYY-MM-DD] + +**Example**: +- [ ] **Update deployment checklist**: Add "verify cache invalidation timing" step + - **Owner**: DevOps Team + - **Target Date**: 2025-11-20 + +- [ ] **Create runbook for cache issues**: Document cache investigation procedures + - **Owner**: SRE Team + - **Target Date**: 2025-11-27 + +--- + +## Review and Sign-off + +### RCA Review Checklist + +Before finalizing this RCA, verify: + +- [ ] Root cause clearly identified with evidence +- [ ] Timeline is accurate and complete +- [ ] All contributing factors documented +- [ ] Prevention actions are specific and actionable +- [ ] All action items have owners and due dates +- [ ] Technical details are accurate and helpful +- [ ] Lessons learned capture key takeaways +- [ ] No blame directed at individuals + +### Reviewers + +| Reviewer | Role | Date Reviewed | Approved | +|----------|------|---------------|----------| +| [Name] | [Engineering Lead / SRE / etc.] | [YYYY-MM-DD] | ☐ Yes ☐ No | +| [Name] | [Engineering Lead / SRE / etc.] 
| [YYYY-MM-DD] | ☐ Yes ☐ No | + +### Sign-off + +**RCA Completed By**: [Name] +**Date**: [YYYY-MM-DD] + +**Incident Commander**: [Name] +**Date**: [YYYY-MM-DD] + +--- + +## Follow-up + +**Next Review Date**: [YYYY-MM-DD - typically 2-4 weeks after incident] + +**Follow-up Items**: +- [ ] Verify all immediate actions completed +- [ ] Check progress on short-term actions +- [ ] Review long-term action planning +- [ ] Assess effectiveness of prevention measures +- [ ] Share learnings with broader team + +--- + +## Additional Notes + +[Any additional context, references, or notes that don't fit above sections] + +**References**: +- [Link to incident Slack channel] +- [Link to monitoring dashboard] +- [Link to related RCAs] +- [Link to fix PRs] + +--- + +**Template Version**: 1.0 +**Last Updated**: 2025-11-06