From ebc71f5387e2205682f457b2aeb7dd693f0b76a8 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 18:29:23 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 15 + README.md | 3 + plugin.lock.json | 177 ++++++ skills/devops-troubleshooting/SKILL.md | 26 + .../devops-troubleshooting/examples/INDEX.md | 68 ++ .../cloudflare-worker-deployment-failure.md | 466 ++++++++++++++ .../examples/distributed-system-debugging.md | 477 ++++++++++++++ .../performance-degradation-analysis.md | 413 ++++++++++++ .../examples/planetscale-connection-issues.md | 499 +++++++++++++++ .../devops-troubleshooting/reference/INDEX.md | 72 +++ .../reference/cloudflare-workers-guide.md | 472 ++++++++++++++ .../reference/diagnostic-commands.md | 473 ++++++++++++++ .../reference/troubleshooting-runbooks.md | 489 ++++++++++++++ .../devops-troubleshooting/templates/INDEX.md | 81 +++ skills/memory-profiling/SKILL.md | 85 +++ skills/memory-profiling/examples/INDEX.md | 86 +++ .../examples/database-connection-leak.md | 490 ++++++++++++++ .../examples/large-dataset-optimization.md | 452 +++++++++++++ .../examples/nodejs-memory-leak.md | 490 ++++++++++++++ .../examples/python-scalene-profiling.md | 456 +++++++++++++ skills/memory-profiling/reference/INDEX.md | 75 +++ .../reference/garbage-collection-guide.md | 392 ++++++++++++ .../reference/memory-optimization-patterns.md | 371 +++++++++++ .../reference/profiling-tools.md | 407 ++++++++++++ skills/memory-profiling/templates/INDEX.md | 60 ++ .../templates/memory-report-template.md | 322 ++++++++++ skills/observability-engineering/SKILL.md | 26 + .../observability-setup-checklist.md | 600 ++++++++++++++++++ .../examples/INDEX.md | 136 ++++ .../reference/INDEX.md | 67 ++ .../templates/INDEX.md | 72 +++ .../templates/grafana-dashboard.json | 210 ++++++ .../templates/prometheus-recording-rules.yaml | 188 ++++++ .../templates/slo-definition.yaml | 173 +++++ skills/observability-monitoring/SKILL.md | 413 ++++++++++++ .../examples/INDEX.md | 48 ++ .../reference/INDEX.md | 32 + 37 files changed, 9382 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/devops-troubleshooting/SKILL.md create mode 100644 skills/devops-troubleshooting/examples/INDEX.md create mode 100644 skills/devops-troubleshooting/examples/cloudflare-worker-deployment-failure.md create mode 100644 skills/devops-troubleshooting/examples/distributed-system-debugging.md create mode 100644 skills/devops-troubleshooting/examples/performance-degradation-analysis.md create mode 100644 skills/devops-troubleshooting/examples/planetscale-connection-issues.md create mode 100644 skills/devops-troubleshooting/reference/INDEX.md create mode 100644 skills/devops-troubleshooting/reference/cloudflare-workers-guide.md create mode 100644 skills/devops-troubleshooting/reference/diagnostic-commands.md create mode 100644 skills/devops-troubleshooting/reference/troubleshooting-runbooks.md create mode 100644 skills/devops-troubleshooting/templates/INDEX.md create mode 100644 skills/memory-profiling/SKILL.md create mode 100644 skills/memory-profiling/examples/INDEX.md create mode 100644 skills/memory-profiling/examples/database-connection-leak.md create mode 100644 skills/memory-profiling/examples/large-dataset-optimization.md create mode 100644 skills/memory-profiling/examples/nodejs-memory-leak.md create mode 100644 skills/memory-profiling/examples/python-scalene-profiling.md create mode 100644 
skills/memory-profiling/reference/INDEX.md create mode 100644 skills/memory-profiling/reference/garbage-collection-guide.md create mode 100644 skills/memory-profiling/reference/memory-optimization-patterns.md create mode 100644 skills/memory-profiling/reference/profiling-tools.md create mode 100644 skills/memory-profiling/templates/INDEX.md create mode 100644 skills/memory-profiling/templates/memory-report-template.md create mode 100644 skills/observability-engineering/SKILL.md create mode 100644 skills/observability-engineering/checklists/observability-setup-checklist.md create mode 100644 skills/observability-engineering/examples/INDEX.md create mode 100644 skills/observability-engineering/reference/INDEX.md create mode 100644 skills/observability-engineering/templates/INDEX.md create mode 100644 skills/observability-engineering/templates/grafana-dashboard.json create mode 100644 skills/observability-engineering/templates/prometheus-recording-rules.yaml create mode 100644 skills/observability-engineering/templates/slo-definition.yaml create mode 100644 skills/observability-monitoring/SKILL.md create mode 100644 skills/observability-monitoring/examples/INDEX.md create mode 100644 skills/observability-monitoring/reference/INDEX.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..1fe7434 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "observability", + "description": "Production observability tools for monitoring, SLO implementation, alerting, and performance tracking. Includes monitoring setup, SLO configuration, distributed tracing, metrics collection, and observability best practices.", + "version": "1.0.0", + "author": { + "name": "Grey Haven Studio", + "url": "https://github.com/greyhaven-ai/claude-code-config" + }, + "skills": [ + "./skills/devops-troubleshooting", + "./skills/memory-profiling", + "./skills/observability-engineering", + "./skills/observability-monitoring" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f917c43 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# observability + +Production observability tools for monitoring, SLO implementation, alerting, and performance tracking. Includes monitoring setup, SLO configuration, distributed tracing, metrics collection, and observability best practices. diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..1c1be87 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,177 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:greyhaven-ai/claude-code-config:grey-haven-plugins/observability", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "60d9ce50f1df1471eac64c59deb3ac68e586bef2", + "treeHash": "4175875d7e822f1a60864b1a608d1494f7786c5b8242e88a1d0e8f44e9033c5a", + "generatedAt": "2025-11-28T10:17:04.969342Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "observability", + "description": "Production observability tools for monitoring, SLO implementation, alerting, and performance tracking. 
Includes monitoring setup, SLO configuration, distributed tracing, metrics collection, and observability best practices.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "1f28cfb2ad225c8c1c9acacb2bc8d1a03fc80cc86e66722f18a865f0974a1b9b" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "9a2cb98f22fd8a1b26e05ae729d25ebde4db0b0691206c7e6b62d7554ff1a895" + }, + { + "path": "skills/observability-monitoring/SKILL.md", + "sha256": "181fd6bcd5c539cc82e06f9e4034e3a07c2d61caed6ce704eb678b16def76604" + }, + { + "path": "skills/observability-monitoring/examples/INDEX.md", + "sha256": "5ca3c63b259270d6b828e91d6e72262b18db7f9b5f901e998f590f6a06226b66" + }, + { + "path": "skills/observability-monitoring/reference/INDEX.md", + "sha256": "15e847b259c0c9a102a4b05ec30dcb26e78e7385f410cb1c26761584830d0e14" + }, + { + "path": "skills/devops-troubleshooting/SKILL.md", + "sha256": "74041e78f0824fb7b8655ee59aa6933a8bd81d52e3e5942b4be95f345794c8b5" + }, + { + "path": "skills/devops-troubleshooting/examples/cloudflare-worker-deployment-failure.md", + "sha256": "bbd15c5d4ee3923c7a16ad023bdc3850e3a8c23d5a28088f69aff2c7b2c6e11a" + }, + { + "path": "skills/devops-troubleshooting/examples/distributed-system-debugging.md", + "sha256": "f7aaf5708cfb89bf05c973468f54868f1757a767ab22c7b2cd8031e629f3a708" + }, + { + "path": "skills/devops-troubleshooting/examples/INDEX.md", + "sha256": "52140e45ca3e431b80db5fbc51ac010b328bc661343d3acb5c40306b7d98ee2a" + }, + { + "path": "skills/devops-troubleshooting/examples/performance-degradation-analysis.md", + "sha256": "0af6c98b4b18663c22ce7ba50dcb9fc83dc757d7260777be15f56fa696707813" + }, + { + "path": "skills/devops-troubleshooting/examples/planetscale-connection-issues.md", + "sha256": "6d5967526aabbf5dbeedec8fba6818674026f73d3d9e2dfe66a23d99094d7832" + }, + { + "path": "skills/devops-troubleshooting/templates/INDEX.md", + "sha256": "1eccf9927cd188d5a4e771624c7030e621685b680828891d29f76fa39f4a8dd9" + }, + { + "path": "skills/devops-troubleshooting/reference/diagnostic-commands.md", + "sha256": "fadba46e2413f4f17306b71385332e3e18b47daf5fd68609fc7bf71d234d128d" + }, + { + "path": "skills/devops-troubleshooting/reference/cloudflare-workers-guide.md", + "sha256": "66b4218314cf61ecb00706fc51137fee7b065fb974f29780a94bed4b993a81a6" + }, + { + "path": "skills/devops-troubleshooting/reference/INDEX.md", + "sha256": "6194ad0c3d76731fe249c425182c881e4eef94a0258a65301c158711c0e19a2c" + }, + { + "path": "skills/devops-troubleshooting/reference/troubleshooting-runbooks.md", + "sha256": "de2c070c80a9996b876b72c20fcbf7fe0ce50986424bcaf23c00458c4b0549e4" + }, + { + "path": "skills/memory-profiling/SKILL.md", + "sha256": "4b095fac61d51dea2b44738e146acc80a9503e3d5c7f387cfed70dbb6d8d1c03" + }, + { + "path": "skills/memory-profiling/examples/python-scalene-profiling.md", + "sha256": "5f5bf5953dc590ad717c3cd723ba6664448572f8c5be4ee425a0f744573b6566" + }, + { + "path": "skills/memory-profiling/examples/database-connection-leak.md", + "sha256": "7859418761c823dd58258537e7621fa9d6f9b3957975aa89cfc7ff77d6eedcb5" + }, + { + "path": "skills/memory-profiling/examples/large-dataset-optimization.md", + "sha256": "aa9f8f8cc5ed4afb178119a0146740b5625cdf7e4c6734c2fc9cf58f5ca41095" + }, + { + "path": "skills/memory-profiling/examples/INDEX.md", + "sha256": "f2c483d506c9f8d04175d8ea3d7a4433b7af2ebac873fe447b841456df67fa5d" + }, + { + "path": "skills/memory-profiling/examples/nodejs-memory-leak.md", + "sha256": 
"75b8b3f9dc55e4b730d822c791265ef539853ad464db44fc4bd3c1790ec93c30" + }, + { + "path": "skills/memory-profiling/templates/memory-report-template.md", + "sha256": "818a811550719641e3221b892bbaade23edcc09a8cfc6ff4c03dab609410c3d6" + }, + { + "path": "skills/memory-profiling/templates/INDEX.md", + "sha256": "214e7aa7f05809e054b69d4b0082cf3871e457ff029a842a3d03caecb03f8dee" + }, + { + "path": "skills/memory-profiling/reference/INDEX.md", + "sha256": "335d0519f7bc1efaaf2d3d39e194c649ba180c9c10f80cb637345ccabf4e40c4" + }, + { + "path": "skills/memory-profiling/reference/garbage-collection-guide.md", + "sha256": "10fe4eed877cebe1b0a4d881033f03df829995bf55f4343c48bdb5663c0c96e3" + }, + { + "path": "skills/memory-profiling/reference/profiling-tools.md", + "sha256": "1f66ca8c10d25885172edfa4951864231d088508a2f28bebf561a37d24ea17a4" + }, + { + "path": "skills/memory-profiling/reference/memory-optimization-patterns.md", + "sha256": "9658cf2c1ff8b93efb04b19986ce81507d757c890afdbd7feb5bd0b3452c7ab4" + }, + { + "path": "skills/observability-engineering/SKILL.md", + "sha256": "9b840997afb17b9d641b0d73d81b2f24af109347de1357219837739060cf1a2a" + }, + { + "path": "skills/observability-engineering/checklists/observability-setup-checklist.md", + "sha256": "e0e3e5bdf24d8516f03a6bc6a018a4336a3bcbabe255d0a520143ad01f7e59da" + }, + { + "path": "skills/observability-engineering/examples/INDEX.md", + "sha256": "497b1279170aef7bf6f48045a0a960595bc509e6dffa0654328240dec9e59c11" + }, + { + "path": "skills/observability-engineering/templates/grafana-dashboard.json", + "sha256": "83c364eaa44e86b2d0f7c7044fd05550cd202c2ec95dab9e278af7fd718c3dbf" + }, + { + "path": "skills/observability-engineering/templates/slo-definition.yaml", + "sha256": "9706827e5acff4e25adbec105f90ded2532b2e0761b2c3331b623479d8ba947e" + }, + { + "path": "skills/observability-engineering/templates/INDEX.md", + "sha256": "984ff9d5f8b09ae63085990af383d7275b56ae3b80615e4430a6d48cfe844f5b" + }, + { + "path": "skills/observability-engineering/templates/prometheus-recording-rules.yaml", + "sha256": "49d1f54bb38e8612404fbf3faad9fe2505de816be693054095939a844a2870f8" + }, + { + "path": "skills/observability-engineering/reference/INDEX.md", + "sha256": "2c0520cc5bb3cd4b040cc9c1d3946000e1fbadeb67acda7e4fafc99e61604509" + } + ], + "dirSha256": "4175875d7e822f1a60864b1a608d1494f7786c5b8242e88a1d0e8f44e9033c5a" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/devops-troubleshooting/SKILL.md b/skills/devops-troubleshooting/SKILL.md new file mode 100644 index 0000000..c410e2a --- /dev/null +++ b/skills/devops-troubleshooting/SKILL.md @@ -0,0 +1,26 @@ +# DevOps Troubleshooting Skill + +DevOps and infrastructure troubleshooting for Cloudflare Workers, PlanetScale PostgreSQL, and distributed systems. + +## Description + +Infrastructure diagnosis, performance analysis, network debugging, and cloud platform troubleshooting. 
+ +## What's Included + +- **Examples**: Deployment issues, connection errors, performance degradation +- **Reference**: Troubleshooting methodologies, common issues +- **Templates**: Diagnostic reports, fix commands + +## Use When + +- Deployment issues +- Infrastructure problems +- Connection errors +- Performance degradation + +## Related Agents + +- `devops-troubleshooter` + +**Skill Version**: 1.0 diff --git a/skills/devops-troubleshooting/examples/INDEX.md b/skills/devops-troubleshooting/examples/INDEX.md new file mode 100644 index 0000000..8009351 --- /dev/null +++ b/skills/devops-troubleshooting/examples/INDEX.md @@ -0,0 +1,68 @@ +# DevOps Troubleshooter Examples + +Real-world infrastructure troubleshooting scenarios for Grey Haven's Cloudflare Workers + PlanetScale PostgreSQL stack. + +## Examples Overview + +### 1. Cloudflare Worker Deployment Failure + +**File**: [cloudflare-worker-deployment-failure.md](cloudflare-worker-deployment-failure.md) +**Scenario**: Worker deployment fails with "Script exceeds size limit" error +**Stack**: Cloudflare Workers, wrangler, webpack bundling +**Impact**: Production deployment blocked, 2-hour downtime +**Resolution**: Bundle size reduction (5.2MB → 1.8MB), code splitting, tree shaking +**Lines**: ~450 lines + +### 2. PlanetScale Connection Pool Exhaustion + +**File**: [planetscale-connection-issues.md](planetscale-connection-issues.md) +**Scenario**: Database connection timeouts causing 503 errors +**Stack**: PlanetScale PostgreSQL, connection pooling, FastAPI +**Impact**: 15% of requests failing, customer complaints +**Resolution**: Connection pool tuning, connection leak fixes +**Lines**: ~430 lines + +### 3. Distributed System Network Debugging + +**File**: [distributed-system-debugging.md](distributed-system-debugging.md) +**Scenario**: Intermittent 504 Gateway Timeout errors between services +**Stack**: Cloudflare Workers, external APIs, DNS, CORS +**Impact**: 5% of API calls failing, no clear pattern +**Resolution**: DNS caching issue, worker timeout configuration +**Lines**: ~420 lines + +### 4. 
Performance Degradation Analysis + +**File**: [performance-degradation-analysis.md](performance-degradation-analysis.md) +**Scenario**: API response times increased from 200ms to 2000ms +**Stack**: Cloudflare Workers, PlanetScale, caching layer +**Impact**: User-facing slowness, poor UX +**Resolution**: N+1 query elimination, caching strategy, index optimization +**Lines**: ~410 lines + +--- + +## Quick Navigation + +**By Issue Type**: +- Deployment failures → [cloudflare-worker-deployment-failure.md](cloudflare-worker-deployment-failure.md) +- Database issues → [planetscale-connection-issues.md](planetscale-connection-issues.md) +- Network problems → [distributed-system-debugging.md](distributed-system-debugging.md) +- Performance issues → [performance-degradation-analysis.md](performance-degradation-analysis.md) + +**By Stack Component**: +- Cloudflare Workers → Examples 1, 3, 4 +- PlanetScale PostgreSQL → Examples 2, 4 +- Distributed Systems → Example 3 + +--- + +## Related Documentation + +- **Reference**: [Reference Index](../reference/INDEX.md) - Runbooks and diagnostic commands +- **Templates**: [Templates Index](../templates/INDEX.md) - Incident templates +- **Main Agent**: [devops-troubleshooter.md](../devops-troubleshooter.md) - DevOps troubleshooter agent + +--- + +Return to [main agent](../devops-troubleshooter.md) diff --git a/skills/devops-troubleshooting/examples/cloudflare-worker-deployment-failure.md b/skills/devops-troubleshooting/examples/cloudflare-worker-deployment-failure.md new file mode 100644 index 0000000..43eec28 --- /dev/null +++ b/skills/devops-troubleshooting/examples/cloudflare-worker-deployment-failure.md @@ -0,0 +1,466 @@ +# Cloudflare Worker Deployment Failure Investigation + +Complete troubleshooting workflow for "Script exceeds size limit" deployment failure, resolved through bundle optimization and code splitting. 
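+
+A note on measuring the limit: Cloudflare enforces the cap on the *compressed* script, so the raw bundle size on disk understates how close a deploy is to failing. A minimal local check (a sketch; it assumes the bundle is emitted to `dist/worker.js` as in the steps below, and uses gzip as an approximation of the platform's compression):
+
+```bash
+npm run build
+gzip -c dist/worker.js | wc -c   # bytes after compression; compare against the plan limit
+```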
+ +## Overview + +**Incident**: Worker deployment failing with size limit error +**Impact**: Production deployment blocked for 2 hours +**Root Cause**: Bundle size grew from 1.2MB to 5.2MB after adding dependencies +**Resolution**: Bundle optimization (code splitting, tree shaking) reduced size to 1.8MB +**Status**: Resolved + +## Incident Timeline + +| Time | Event | Action | +|------|-------|--------| +| 14:00 | Deployment initiated via CI/CD | `wrangler deploy` triggered | +| 14:02 | Deployment failed | Error: "Script exceeds 1MB size limit" | +| 14:05 | Investigation started | Check recent code changes | +| 14:15 | Root cause identified | New dependencies increased bundle size | +| 14:30 | Fix implemented | Bundle optimization applied | +| 14:45 | Fix deployed | Successful deployment to production | +| 16:00 | Monitoring complete | Confirmed stable deployment | + +--- + +## Symptoms and Detection + +### Initial Error + +**Deployment Command**: +```bash +$ wrangler deploy +✘ [ERROR] Script exceeds the size limit (5.2MB > 1MB after compression) +``` + +**CI/CD Pipeline Failure**: +```yaml +# GitHub Actions output +Step: Deploy to Cloudflare Workers + ✓ Build completed (5.2MB bundle) + ✗ Deployment failed: Script size exceeds limit + Error: Workers Free plan limit is 1MB compressed +``` + +**Impact**: +- Production deployment blocked +- New features stuck in staging +- Team unable to deploy hotfixes + +--- + +## Diagnosis + +### Step 1: Check Bundle Size + +**Before Investigation**: +```bash +# Build the worker locally +npm run build + +# Check output size +ls -lh dist/ +-rw-r--r-- 1 user staff 5.2M Dec 5 14:10 worker.js +``` + +**Analyze Bundle Composition**: +```bash +# Use webpack-bundle-analyzer +npm install --save-dev webpack-bundle-analyzer + +# Add to webpack.config.js +const BundleAnalyzerPlugin = require('webpack-bundle-analyzer').BundleAnalyzerPlugin; + +module.exports = { + plugins: [ + new BundleAnalyzerPlugin() + ] +}; + +# Build and open analyzer +npm run build +# Opens http://127.0.0.1:8888 with visual bundle breakdown +``` + +**Bundle Analyzer Findings**: +``` +Total Size: 5.2MB + +Breakdown: +- @anthropic-ai/sdk: 2.1MB (40%) +- aws-sdk: 1.8MB (35%) +- lodash: 800KB (15%) +- moment: 300KB (6%) +- application code: 200KB (4%) +``` + +**Red Flags**: +1. Full `aws-sdk` imported (only needed S3) +2. Entire `lodash` library (only using 3 functions) +3. `moment` included (native Date API would suffice) +4. Large AI SDK (only using text generation) + +--- + +### Step 2: Identify Recent Changes + +**Git Diff**: +```bash +# Check what changed in last deploy +git diff HEAD~1 HEAD -- src/ + +# Key changes: ++ import { Anthropic } from '@anthropic-ai/sdk'; ++ import AWS from 'aws-sdk'; ++ import _ from 'lodash'; ++ import moment from 'moment'; +``` + +**PR Analysis**: +``` +PR #234: Add AI content generation feature +- Added @anthropic-ai/sdk (full SDK) +- Added AWS S3 integration (full aws-sdk) +- Used lodash for data manipulation +- Used moment for date formatting + +Result: Bundle size increased by 4MB +``` + +--- + +### Step 3: Cloudflare Worker Size Limits + +**Plan Limits**: +``` +Workers Free: 1MB compressed +Workers Paid: 10MB compressed + +Current plan: Workers Free +Current size: 5.2MB (over limit) + +Options: +1. Upgrade to Workers Paid ($5/month) +2. Reduce bundle size to <1MB +3. 
Split into multiple workers +``` + +**Decision**: Reduce bundle size (no budget for upgrade) + +--- + +## Resolution + +### Fix 1: Tree Shaking with Named Imports + +**Before** (imports entire libraries): +```typescript +// ❌ BAD: Imports full library +import _ from 'lodash'; +import moment from 'moment'; +import AWS from 'aws-sdk'; + +// Usage: +const unique = _.uniq(array); +const date = moment().format('YYYY-MM-DD'); +const s3 = new AWS.S3(); +``` + +**After** (imports only needed functions): +```typescript +// ✅ GOOD: Named imports enable tree shaking +import { uniq, map, filter } from 'lodash-es'; +import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3'; + +// ✅ BETTER: Native alternatives +const unique = [...new Set(array)]; +const date = new Date().toISOString().split('T')[0]; + +// S3 client (v3 - modular) +const s3 = new S3Client({ region: 'us-east-1' }); +``` + +**Size Reduction**: +``` +Before: +- lodash: 800KB → lodash-es tree-shaken: 50KB (94% reduction) +- moment: 300KB → native Date: 0KB (100% reduction) +- aws-sdk: 1.8MB → @aws-sdk/client-s3: 200KB (89% reduction) +``` + +--- + +### Fix 2: External Dependencies (Don't Bundle Large SDKs) + +**Before**: +```typescript +// worker.ts - bundled @anthropic-ai/sdk (2.1MB) +import { Anthropic } from '@anthropic-ai/sdk'; + +const client = new Anthropic({ + apiKey: env.ANTHROPIC_API_KEY +}); +``` + +**After** (use fetch directly): +```typescript +// worker.ts - use native fetch (0KB) +async function callAnthropic(prompt: string, env: Env) { + const response = await fetch('https://api.anthropic.com/v1/messages', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-api-key': env.ANTHROPIC_API_KEY, + 'anthropic-version': '2023-06-01' + }, + body: JSON.stringify({ + model: 'claude-3-sonnet-20240229', + max_tokens: 1024, + messages: [ + { role: 'user', content: prompt } + ] + }) + }); + + return response.json(); +} +``` + +**Size Reduction**: +``` +Before: @anthropic-ai/sdk: 2.1MB +After: Native fetch: 0KB +Savings: 2.1MB (100% reduction) +``` + +--- + +### Fix 3: Code Splitting (Async Imports) + +**Before** (everything bundled): +```typescript +// worker.ts +import { expensiveFunction } from './expensive-module'; + +export default { + async fetch(request: Request, env: Env) { + // Even if not used, expensive-module is in bundle + if (request.url.includes('/special')) { + return expensiveFunction(request); + } + return new Response('OK'); + } +}; +``` + +**After** (lazy load): +```typescript +// worker.ts +export default { + async fetch(request: Request, env: Env) { + if (request.url.includes('/special')) { + // Only load when needed (separate chunk) + const { expensiveFunction } = await import('./expensive-module'); + return expensiveFunction(request); + } + return new Response('OK'); + } +}; +``` + +**Size Reduction**: +``` +Main bundle: 1.8MB → 500KB (72% reduction) +expensive-module chunk: Loaded on-demand (lazy) +``` + +--- + +### Fix 4: Webpack Configuration Optimization + +**Updated webpack.config.js**: +```javascript +const webpack = require('webpack'); +const path = require('path'); + +module.exports = { + entry: './src/worker.ts', + target: 'webworker', + mode: 'production', + optimization: { + minimize: true, + usedExports: true, // Tree shaking + sideEffects: false, + }, + resolve: { + extensions: ['.ts', '.js'], + alias: { + // Replace heavy libraries with lighter alternatives + 'moment': 'date-fns', + 'lodash': 'lodash-es' + } + }, + module: { + rules: [ + { + test: /\.ts$/, + use: { + loader: 
'ts-loader', + options: { + transpileOnly: true, + compilerOptions: { + module: 'esnext', // Enable tree shaking + moduleResolution: 'node' + } + } + }, + exclude: /node_modules/ + } + ] + }, + plugins: [ + new webpack.DefinePlugin({ + 'process.env.NODE_ENV': JSON.stringify('production') + }) + ], + output: { + filename: 'worker.js', + path: path.resolve(__dirname, 'dist'), + libraryTarget: 'commonjs2' + } +}; +``` + +--- + +## Results + +### Bundle Size Comparison + +| Category | Before | After | Reduction | +|----------|--------|-------|-----------| +| **@anthropic-ai/sdk** | 2.1MB | 0KB (fetch) | -100% | +| **aws-sdk** | 1.8MB | 200KB (v3) | -89% | +| **lodash** | 800KB | 50KB (tree-shaken) | -94% | +| **moment** | 300KB | 0KB (native Date) | -100% | +| **Application code** | 200KB | 200KB | 0% | +| **TOTAL** | **5.2MB** | **450KB** | **-91%** | + +**Compressed Size**: +- Before: 5.2MB → 1.8MB compressed (over 1MB limit) +- After: 450KB → 180KB compressed (under 1MB limit) + +--- + +### Deployment Verification + +**Successful Deployment**: +```bash +$ wrangler deploy +✔ Building... +✔ Validating... +Bundle size: 450KB (180KB compressed) +✔ Uploading... +✔ Deployed to production + +Production URL: https://api.greyhaven.io +Worker ID: worker-abc123 +``` + +**Load Testing**: +```bash +# Before optimization (would fail deployment) +# Bundle: 5.2MB, deploy: FAIL + +# After optimization +$ ab -n 1000 -c 10 https://api.greyhaven.io/ +Requests per second: 1250 [#/sec] +Time per request: 8ms [mean] +Successful requests: 1000 (100%) +Bundle size: 450KB ✓ +``` + +--- + +## Prevention Measures + +### 1. CI/CD Bundle Size Check + +```yaml +# .github/workflows/deploy.yml - Add size validation +steps: + - run: npm ci && npm run build + - name: Check bundle size + run: | + SIZE_MB=$(stat -f%z dist/worker.js | awk '{print $1/1048576}') + if (( $(echo "$SIZE_MB > 1.0" | bc -l) )); then + echo "❌ Bundle exceeds 1MB"; exit 1 + fi + - run: npx wrangler deploy +``` + +### 2. Pre-commit Hook + +```bash +# .git/hooks/pre-commit +SIZE_MB=$(stat -f%z dist/worker.js | awk '{print $1/1048576}') +[ "$SIZE_MB" -lt "1.0" ] || { echo "❌ Bundle >1MB"; exit 1; } +``` + +### 3. PR Template + +```markdown +## Bundle Impact +- [ ] Bundle size <800KB +- [ ] Tree shaking verified +Size: [Before → After] +``` + +### 4. Automated Analysis + +```json +{ + "scripts": { + "analyze": "webpack --profile --json > stats.json && webpack-bundle-analyzer stats.json" + } +} +``` + +--- + +## Lessons Learned + +### What Went Well + +✅ Identified root cause quickly (bundle analyzer) +✅ Multiple optimization strategies applied +✅ Achieved 91% bundle size reduction +✅ Added automated checks to prevent recurrence + +### What Could Be Improved + +❌ No bundle size monitoring before incident +❌ Dependencies added without size consideration +❌ No pre-commit checks for bundle size + +### Key Takeaways + +1. **Always check bundle size** when adding dependencies +2. **Use native APIs** instead of libraries when possible +3. **Tree shaking** requires named imports (not default) +4. **Code splitting** for rarely-used features +5. 
**External API calls** are lighter than bundling SDKs + +--- + +## Related Documentation + +- **PlanetScale Issues**: [planetscale-connection-issues.md](planetscale-connection-issues.md) +- **Network Debugging**: [distributed-system-debugging.md](distributed-system-debugging.md) +- **Performance**: [performance-degradation-analysis.md](performance-degradation-analysis.md) +- **Runbooks**: [../reference/troubleshooting-runbooks.md](../reference/troubleshooting-runbooks.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/devops-troubleshooting/examples/distributed-system-debugging.md b/skills/devops-troubleshooting/examples/distributed-system-debugging.md new file mode 100644 index 0000000..8ffd4d0 --- /dev/null +++ b/skills/devops-troubleshooting/examples/distributed-system-debugging.md @@ -0,0 +1,477 @@ +# Distributed System Network Debugging + +Investigating intermittent 504 Gateway Timeout errors between Cloudflare Workers and external APIs, resolved through DNS caching and timeout tuning. + +## Overview + +**Incident**: 5% of API requests failing with 504 timeouts +**Impact**: Intermittent failures, no clear pattern, user frustration +**Root Cause**: DNS resolution delays + worker timeout too aggressive +**Resolution**: DNS caching + timeout increase (5s→30s) +**Status**: Resolved + +## Incident Timeline + +| Time | Event | Action | +|------|-------|--------| +| 14:00 | 504 errors detected | Alerts triggered | +| 14:10 | Pattern analysis started | Check logs, no obvious cause | +| 14:30 | Network trace performed | Found DNS delays | +| 14:50 | Root cause identified | DNS + timeout combination | +| 15:10 | Fix deployed | DNS caching + timeout tuning | +| 15:40 | Monitoring confirmed | 504s eliminated | + +--- + +## Symptoms and Detection + +### Initial Alerts + +**Error Pattern**: +``` +[ERROR] Request to https://api.partner.com/data failed: 504 Gateway Timeout +[ERROR] Upstream timeout after 5000ms +[ERROR] DNS lookup took 3200ms (80% of timeout!) +``` + +**Characteristics**: +- ❌ Random occurrence (5% of requests) +- ❌ No pattern by time of day +- ❌ Affects all worker regions equally +- ❌ External API reports no issues +- ✅ Only affects specific external endpoints + +--- + +## Diagnosis + +### Step 1: Network Request Breakdown + +**curl Timing Analysis**: +```bash +# Test external API with detailed timing +curl -w "\nDNS: %{time_namelookup}s\nConnect: %{time_connect}s\nTLS: %{time_appconnect}s\nStart: %{time_starttransfer}s\nTotal: %{time_total}s\n" \ + -o /dev/null -s https://api.partner.com/data + +# Results (intermittent): +DNS: 3.201s # ❌ Very slow! +Connect: 3.450s +TLS: 3.780s +Start: 4.120s +Total: 4.823s # Close to 5s worker timeout +``` + +**Fast vs Slow Requests**: +``` +FAST (95% of requests): +DNS: 0.050s → Connect: 0.120s → Total: 0.850s ✅ + +SLOW (5% of requests): +DNS: 3.200s → Connect: 3.450s → Total: 4.850s ❌ (near timeout) +``` + +**Root Cause**: DNS resolution delays causing total request time to exceed worker timeout. + +--- + +### Step 2: DNS Investigation + +**nslookup Testing**: +```bash +# Test DNS resolution +time nslookup api.partner.com + +# Results (vary): +Run 1: 0.05s ✅ +Run 2: 3.10s ❌ +Run 3: 0.04s ✅ +Run 4: 2.95s ❌ + +Pattern: DNS cache miss causes 3s delay +``` + +**dig Analysis**: +```bash +# Detailed DNS query +dig api.partner.com +stats + +# Results: +;; Query time: 3021 msec # Slow! 
+;; SERVER: 1.1.1.1#53(1.1.1.1)
+;; WHEN: Thu Dec 05 14:25:32 UTC 2024
+;; MSG SIZE rcvd: 84
+
+# Root cause: No DNS caching in worker
+```
+
+---
+
+### Step 3: Worker Timeout Configuration
+
+**Current Worker Code**:
+```typescript
+// worker.ts (BEFORE - Too aggressive timeout)
+export default {
+  async fetch(request: Request, env: Env) {
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), 5000); // 5s timeout
+
+    try {
+      const response = await fetch('https://api.partner.com/data', {
+        signal: controller.signal
+      });
+      return response;
+    } catch (error) {
+      // 5% of requests timeout here
+      return new Response('Gateway Timeout', { status: 504 });
+    } finally {
+      clearTimeout(timeout);
+    }
+  }
+};
+```
+
+**Problem**: 5s timeout doesn't account for DNS delays (up to 3s).
+
+---
+
+### Step 4: CORS and Headers Check
+
+**Test CORS Headers**:
+```bash
+# Check CORS preflight
+curl -I -X OPTIONS https://api.greyhaven.io/proxy \
+  -H "Origin: https://app.greyhaven.io" \
+  -H "Access-Control-Request-Method: POST"
+
+# Response:
+HTTP/2 200
+access-control-allow-origin: https://app.greyhaven.io ✅
+access-control-allow-methods: GET, POST, PUT, DELETE ✅
+access-control-max-age: 86400 ✅
+```
+
+**No CORS issues** - problem isolated to DNS + timeout.
+
+---
+
+## Resolution
+
+### Fix 1: Implement DNS Caching
+
+**Worker with DNS Cache**:
+```typescript
+// worker.ts (AFTER - With DNS caching)
+interface DnsCache {
+  ip: string;
+  timestamp: number;
+  ttl: number;
+}
+
+const DNS_CACHE = new Map<string, DnsCache>();
+const DNS_TTL = 60 * 1000; // 60 seconds
+
+async function resolveWithCache(hostname: string): Promise<string> {
+  const cached = DNS_CACHE.get(hostname);
+
+  if (cached && Date.now() - cached.timestamp < cached.ttl) {
+    // Cache hit - return immediately
+    return cached.ip;
+  }
+
+  // Cache miss - resolve DNS
+  const dnsResponse = await fetch(`https://1.1.1.1/dns-query?name=${hostname}`, {
+    headers: { 'accept': 'application/dns-json' }
+  });
+  const dnsData = await dnsResponse.json();
+  const ip = dnsData.Answer[0].data;
+
+  // Update cache
+  DNS_CACHE.set(hostname, {
+    ip,
+    timestamp: Date.now(),
+    ttl: DNS_TTL
+  });
+
+  return ip;
+}
+
+export default {
+  async fetch(request: Request, env: Env) {
+    // Pre-resolve DNS (cached)
+    const ip = await resolveWithCache('api.partner.com');
+
+    // Use IP directly (bypass DNS)
+    const response = await fetch(`https://${ip}/data`, {
+      headers: {
+        'Host': 'api.partner.com' // Preserve the original hostname for origin routing
+      }
+    });
+
+    return response;
+  }
+};
+```
+
+**Result**: DNS resolution <5ms (cache hit) vs 3000ms (cache miss).
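+
+One practical caveat: `DNS_CACHE` lives in the isolate's memory, so every cold isolate starts with an empty cache and pays the slow lookup once. A minimal sketch for confirming the cache is actually absorbing lookups in production (it assumes the `resolveWithCache` helper above; the log fields and the 10ms hit threshold are illustrative):
+
+```typescript
+// utils/dns-timing.ts - wrap the resolver to log hit/miss latency
+async function resolveWithTiming(hostname: string): Promise<string> {
+  const start = Date.now();
+  const ip = await resolveWithCache(hostname);
+  const elapsed = Date.now() - start;
+  // Anything under ~10ms is effectively a cache hit; log both cases for dashboards
+  console.log(JSON.stringify({ event: 'dns_resolve', hostname, elapsed_ms: elapsed, cache_hit: elapsed < 10 }));
+  return ip;
+}
+```
+
+If cold-start misses still show up frequently, persisting entries in Workers KV or the Cache API instead of a per-isolate Map is a possible follow-up, at the cost of an extra read per request.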
+
+---
+
+### Fix 2: Increase Worker Timeout
+
+**Updated Timeout**:
+```typescript
+// worker.ts - Increased timeout to account for DNS
+const controller = new AbortController();
+const timeout = setTimeout(() => controller.abort(), 30000); // 30s timeout
+
+try {
+  const response = await fetch('https://api.partner.com/data', {
+    signal: controller.signal
+  });
+  return response;
+} finally {
+  clearTimeout(timeout);
+}
+```
+
+**Timeout Breakdown**:
+```
+Old: 5s total
+- DNS: 3s (worst case)
+- Connect: 1s
+- Request: 1s
+= Frequent timeouts
+
+New: 30s total
+- DNS: <0.01s (cached)
+- Connect: 1s
+- Request: 2s
+- Buffer: 27s (ample)
+= No timeouts
+```
+
+---
+
+### Fix 3: Add Retry Logic with Exponential Backoff
+
+**Retry Implementation**:
+```typescript
+// utils/retry.ts
+async function fetchWithRetry(
+  url: string,
+  options: RequestInit,
+  maxRetries: number = 3
+): Promise<Response> {
+  for (let attempt = 0; attempt < maxRetries; attempt++) {
+    try {
+      const response = await fetch(url, options);
+
+      // Retry on 5xx errors
+      if (response.status >= 500 && attempt < maxRetries - 1) {
+        const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
+        await new Promise(resolve => setTimeout(resolve, delay));
+        continue;
+      }
+
+      return response;
+    } catch (error) {
+      if (attempt === maxRetries - 1) throw error;
+
+      // Exponential backoff: 1s, 2s, 4s
+      const delay = Math.pow(2, attempt) * 1000;
+      await new Promise(resolve => setTimeout(resolve, delay));
+    }
+  }
+
+  throw new Error('Max retries exceeded');
+}
+
+// Usage:
+const response = await fetchWithRetry('https://api.partner.com/data', {
+  signal: controller.signal
+});
+```
+
+---
+
+### Fix 4: Circuit Breaker Pattern
+
+**Prevent Cascading Failures**:
+```typescript
+// utils/circuit-breaker.ts
+class CircuitBreaker {
+  private failures: number = 0;
+  private lastFailureTime: number = 0;
+  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
+
+  async execute<T>(fn: () => Promise<T>): Promise<T> {
+    if (this.state === 'OPEN') {
+      // Check if enough time passed to try again
+      if (Date.now() - this.lastFailureTime > 60000) {
+        this.state = 'HALF_OPEN';
+      } else {
+        throw new Error('Circuit breaker is OPEN');
+      }
+    }
+
+    try {
+      const result = await fn();
+      this.onSuccess();
+      return result;
+    } catch (error) {
+      this.onFailure();
+      throw error;
+    }
+  }
+
+  private onSuccess() {
+    this.failures = 0;
+    this.state = 'CLOSED';
+  }
+
+  private onFailure() {
+    this.failures++;
+    this.lastFailureTime = Date.now();
+
+    if (this.failures >= 5) {
+      this.state = 'OPEN'; // Trip circuit after 5 failures
+    }
+  }
+}
+
+// Usage:
+const breaker = new CircuitBreaker();
+const response = await breaker.execute(() =>
+  fetch('https://api.partner.com/data')
+);
+```
+
+---
+
+## Results
+
+### Before vs After Metrics
+
+| Metric | Before Fix | After Fix | Improvement |
+|--------|-----------|-----------|-------------|
+| **504 Error Rate** | 5% | 0.01% | **99.8% reduction** |
+| **DNS Resolution** | 3000ms (worst) | <5ms (cached) | **99.8% faster** |
+| **Total Request Time** | 4800ms (p95) | 850ms (p95) | **82% faster** |
+| **Timeout Threshold** | 5s (too low) | 30s (appropriate) | +500% headroom |
+
+---
+
+### Network Diagnostics
+
+**traceroute Analysis**:
+```bash
+# Check network path to external API
+traceroute api.partner.com
+
+# Results show no packet loss
+ 1 gateway (10.0.0.1) 1.234 ms
+ 2 isp-router (100.64.0.1) 5.678 ms
+...
+15 api.partner.com (203.0.113.42) 45.234 ms
+```
+
+**No packet loss** - confirms DNS was the issue, not network.
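+
+The three code-level fixes compose naturally into a single call path: the circuit breaker wraps the retrying fetch, which in turn carries the longer timeout. A minimal sketch of how they might be wired together (reusing `fetchWithRetry` and `CircuitBreaker` from Fixes 3 and 4; the 30s budget mirrors Fix 2 and the helper name is illustrative):
+
+```typescript
+// worker.ts - combined timeout + retry + circuit breaker for the partner API
+const partnerBreaker = new CircuitBreaker();
+
+async function callPartnerApi(): Promise<Response> {
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), 30000); // Fix 2: 30s overall budget
+  try {
+    // Fix 4 (breaker) around Fix 3 (retries) around the plain fetch
+    return await partnerBreaker.execute(() =>
+      fetchWithRetry('https://api.partner.com/data', { signal: controller.signal })
+    );
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+```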
+ +--- + +## Prevention Measures + +### 1. Network Monitoring Dashboard + +**Metrics to Track**: +```typescript +// Track network timing metrics +const network_dns_duration = new Histogram({ + name: 'network_dns_duration_seconds', + help: 'DNS resolution time' +}); + +const network_connect_duration = new Histogram({ + name: 'network_connect_duration_seconds', + help: 'TCP connection time' +}); + +const network_total_duration = new Histogram({ + name: 'network_total_duration_seconds', + help: 'Total request time' +}); +``` + +### 2. Alert Rules + +```yaml +# Alert on high DNS resolution time +- alert: SlowDnsResolution + expr: histogram_quantile(0.95, network_dns_duration_seconds) > 1 + for: 5m + annotations: + summary: "DNS resolution p95 >1s" + +# Alert on gateway timeouts +- alert: HighGatewayTimeouts + expr: rate(http_requests_total{status="504"}[5m]) > 0.01 + for: 5m + annotations: + summary: "504 error rate >1%" +``` + +### 3. Health Check Endpoints + +```typescript +@app.get("/health/network") +async function networkHealth() { + const checks = await Promise.all([ + checkDns('api.partner.com'), + checkConnectivity('https://api.partner.com/health'), + checkLatency('https://api.partner.com/ping') + ]); + + return { + status: checks.every(c => c.healthy) ? 'healthy' : 'degraded', + checks + }; +} +``` + +--- + +## Lessons Learned + +### What Went Well + +✅ Detailed network timing analysis pinpointed DNS +✅ DNS caching eliminated 99.8% of timeouts +✅ Circuit breaker prevents cascading failures + +### What Could Be Improved + +❌ No DNS monitoring before incident +❌ Timeout too aggressive without considering DNS +❌ No retry logic for transient failures + +### Key Takeaways + +1. **Always cache DNS** in workers (60s TTL minimum) +2. **Account for DNS time** when setting timeouts +3. **Add retry logic** with exponential backoff +4. **Implement circuit breakers** for external dependencies +5. **Monitor network timing** (DNS, connect, TLS, transfer) + +--- + +## Related Documentation + +- **Worker Deployment**: [cloudflare-worker-deployment-failure.md](cloudflare-worker-deployment-failure.md) +- **Database Issues**: [planetscale-connection-issues.md](planetscale-connection-issues.md) +- **Performance**: [performance-degradation-analysis.md](performance-degradation-analysis.md) +- **Runbooks**: [../reference/troubleshooting-runbooks.md](../reference/troubleshooting-runbooks.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/devops-troubleshooting/examples/performance-degradation-analysis.md b/skills/devops-troubleshooting/examples/performance-degradation-analysis.md new file mode 100644 index 0000000..50cf046 --- /dev/null +++ b/skills/devops-troubleshooting/examples/performance-degradation-analysis.md @@ -0,0 +1,413 @@ +# Performance Degradation Analysis + +Investigating API response time increase from 200ms to 2000ms, resolved through N+1 query elimination, caching, and index optimization. 
+ +## Overview + +**Incident**: API response times degraded 10x (200ms → 2000ms) +**Impact**: User-facing slowness, timeout errors, poor UX +**Root Cause**: N+1 query problem + missing indexes + no caching +**Resolution**: Query optimization + indexes + Redis caching +**Status**: Resolved + +## Incident Timeline + +| Time | Event | Action | +|------|-------|--------| +| 08:00 | Slowness reports from users | Support tickets opened | +| 08:15 | Monitoring confirms degradation | p95 latency 2000ms | +| 08:30 | Database profiling started | Slow query log analysis | +| 09:00 | N+1 query identified | Found 100+ queries per request | +| 09:30 | Fix implemented | Eager loading + indexes | +| 10:00 | Caching added | Redis for frequently accessed data | +| 10:30 | Deployment complete | Latency back to 200ms | + +--- + +## Symptoms and Detection + +### Initial Metrics + +**Latency Increase**: +``` +p50: 180ms → 1800ms (+900% slower) +p95: 220ms → 2100ms (+854% slower) +p99: 450ms → 3500ms (+677% slower) + +Requests timing out: 5% (>3s timeout) +``` + +**User Impact**: +- Page load times: 5-10 seconds +- API timeouts: 5% of requests +- Support tickets: 47 in 1 hour +- User complaints: "App is unusable" + +--- + +## Diagnosis + +### Step 1: Application Performance Monitoring + +**Wrangler Tail Analysis**: +```bash +# Monitor worker requests in real-time +wrangler tail --format pretty + +# Output shows slow requests: +[2024-12-05 08:20:15] GET /api/orders - 2145ms + └─ database_query: 1950ms (90% of total time!) + └─ json_serialization: 150ms + └─ response_headers: 45ms + +# Red flag: Database taking 90% of request time +``` + +--- + +### Step 2: Database Query Analysis + +**PlanetScale Slow Query Log**: +```bash +# Enable and check slow queries +pscale database insights greyhaven-db main --slow-queries + +# Results: +Query: SELECT * FROM order_items WHERE order_id = ? +Calls: 157 times per request # ❌ N+1 query problem! +Avg time: 12ms per query +Total: 1884ms per request (12ms × 157) +``` + +**N+1 Query Pattern Identified**: +```python +# api/orders.py (BEFORE - N+1 Problem) +@router.get("/orders/{user_id}") +async def get_user_orders(user_id: int, session: Session = Depends(get_session)): + # Query 1: Get all orders for user + orders = session.exec( + select(Order).where(Order.user_id == user_id) + ).all() # Returns 157 orders + + # Query 2-158: Get items for EACH order (N+1!) + for order in orders: + order.items = session.exec( + select(OrderItem).where(OrderItem.order_id == order.id) + ).all() # 157 additional queries! + + return orders + +# Total queries: 1 + 157 = 158 queries per request +# Total time: 10ms + (157 × 12ms) = 1894ms +``` + +--- + +### Step 3: Database Index Analysis + +**Missing Indexes**: +```sql +-- Check existing indexes +SELECT indexname, indexdef +FROM pg_indexes +WHERE tablename = 'order_items'; + +-- Results: +-- Primary key on id (exists) ✅ +-- NO index on order_id ❌ (needed for WHERE clause) +-- NO index on user_id ❌ (needed for joins) + +-- Explain plan shows full table scan +EXPLAIN ANALYZE +SELECT * FROM order_items WHERE order_id = 123; + +-- Result: +Seq Scan on order_items (cost=0.00..1500.00 rows=1 width=100) (actual time=12.345..12.345 rows=5 loops=157) + Filter: (order_id = 123) + Rows Removed by Filter: 10000 + +-- Full table scan on 10K rows, 157 times = extremely slow! 
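+
+-- Follow-up check (a sketch using the standard PostgreSQL statistics view):
+-- tables with high seq_scan and low idx_scan are the ones most likely missing an index
+SELECT relname, seq_scan, idx_scan
+FROM pg_stat_user_tables
+ORDER BY seq_scan DESC
+LIMIT 10;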
+``` + +--- + +## Resolution + +### Fix 1: Eliminate N+1 with Eager Loading + +**After - Single Query with Join**: +```python +# api/orders.py (AFTER - Eager Loading) +from sqlmodel import select +from sqlalchemy.orm import selectinload + +@router.get("/orders/{user_id}") +async def get_user_orders(user_id: int, session: Session = Depends(get_session)): + # ✅ Single query with eager loading + statement = ( + select(Order) + .where(Order.user_id == user_id) + .options(selectinload(Order.items)) # Eager load items + ) + + orders = session.exec(statement).all() + + return orders + +# Total queries: 2 (1 for orders, 1 for all items) +# Total time: 10ms + 25ms = 35ms (98% faster!) +``` + +**Query Comparison**: +``` +BEFORE (N+1): +- Query 1: SELECT * FROM orders WHERE user_id = 1 (10ms) +- Query 2-158: SELECT * FROM order_items WHERE order_id = ? (×157, 12ms each) +- Total: 1894ms + +AFTER (Eager Loading): +- Query 1: SELECT * FROM orders WHERE user_id = 1 (10ms) +- Query 2: SELECT * FROM order_items WHERE order_id IN (?, ?, ..., ?) (25ms) +- Total: 35ms (54x faster!) +``` + +--- + +### Fix 2: Add Database Indexes + +**Create Indexes**: +```sql +-- Index on order_id for faster lookups +CREATE INDEX idx_order_items_order_id ON order_items(order_id); + +-- Index on user_id for user queries +CREATE INDEX idx_orders_user_id ON orders(user_id); + +-- Index on created_at for time-based queries +CREATE INDEX idx_orders_created_at ON orders(created_at); + +-- Composite index for common filters +CREATE INDEX idx_orders_user_created ON orders(user_id, created_at DESC); +``` + +**Before/After EXPLAIN**: +```sql +-- BEFORE (no index): +EXPLAIN ANALYZE SELECT * FROM order_items WHERE order_id = 123; +Seq Scan (cost=0.00..1500.00) (actual time=12.345ms) + +-- AFTER (with index): +Index Scan using idx_order_items_order_id (cost=0.00..8.50) (actual time=0.045ms) + +-- 270x faster (12.345ms → 0.045ms) +``` + +--- + +### Fix 3: Implement Redis Caching + +**Cache Frequent Queries**: +```typescript +// cache.ts - Redis caching layer +import { Redis } from '@upstash/redis'; + +const redis = new Redis({ + url: env.UPSTASH_REDIS_URL, + token: env.UPSTASH_REDIS_TOKEN +}); + +async function getCachedOrders(userId: number) { + const cacheKey = `orders:user:${userId}`; + + // Check cache + const cached = await redis.get(cacheKey); + if (cached) { + return JSON.parse(cached); // Cache hit + } + + // Cache miss - query database + const orders = await fetchOrdersFromDb(userId); + + // Store in cache (5 minute TTL) + await redis.setex(cacheKey, 300, JSON.stringify(orders)); + + return orders; +} +``` + +**Cache Hit Rates**: +``` +Requests: 10,000 +Cache hits: 8,500 (85%) +Cache misses: 1,500 (15%) + +Avg latency with cache: +- Cache hit: 5ms (Redis) +- Cache miss: 35ms (database) +- Overall: (0.85 × 5) + (0.15 × 35) = 9.5ms +``` + +--- + +### Fix 4: Database Connection Pooling + +**Optimize Pool Settings**: +```python +# database.py - Tuned for performance +engine = create_engine( + database_url, + pool_size=50, # Increased from 20 + max_overflow=20, + pool_recycle=1800, # 30 minutes + pool_pre_ping=True, # Health check + echo=False, + connect_args={ + "server_settings": { + "statement_timeout": "30000", # 30s query timeout + "idle_in_transaction_session_timeout": "60000" # 60s idle + } + } +) +``` + +--- + +## Results + +### Performance Metrics + +| Metric | Before | After | Improvement | +|--------|--------|-------|-------------| +| **p50 Latency** | 1800ms | 180ms | **90% faster** | +| **p95 Latency** | 2100ms | 220ms | 
**90% faster** | +| **p99 Latency** | 3500ms | 450ms | **87% faster** | +| **Database Queries** | 158/request | 2/request | **99% reduction** | +| **Cache Hit Rate** | 0% | 85% | **85% hits** | +| **Timeout Errors** | 5% | 0% | **100% eliminated** | + +### Cost Impact + +**Database Query Reduction**: +``` +Before: 158 queries × 100 req/s = 15,800 queries/s +After: 2 queries × 100 req/s = 200 queries/s + +Reduction: 98.7% fewer queries +Cost savings: $450/month (reduced database tier) +``` + +--- + +## Prevention Measures + +### 1. Query Performance Monitoring + +**Slow Query Alert**: +```yaml +# Alert on slow database queries +- alert: SlowDatabaseQueries + expr: histogram_quantile(0.95, rate(database_query_duration_seconds[5m])) > 0.1 + for: 5m + annotations: + summary: "Database queries p95 >100ms" +``` + +### 2. N+1 Query Detection + +**Test for N+1 Patterns**: +```python +# tests/test_n_plus_one.py +import pytest +from sqlalchemy import event +from database import engine + +@pytest.fixture +def query_counter(): + """Count SQL queries during test""" + queries = [] + + def before_cursor_execute(conn, cursor, statement, parameters, context, executemany): + queries.append(statement) + + event.listen(engine, "before_cursor_execute", before_cursor_execute) + yield queries + event.remove(engine, "before_cursor_execute", before_cursor_execute) + +def test_get_user_orders_no_n_plus_one(query_counter): + """Verify endpoint doesn't have N+1 queries""" + get_user_orders(user_id=1) + + # Should be 2 queries max (orders + items) + assert len(query_counter) <= 2, f"N+1 detected: {len(query_counter)} queries" +``` + +### 3. Database Index Coverage + +```sql +-- Check for missing indexes +SELECT + schemaname, + tablename, + attname, + n_distinct, + correlation +FROM pg_stats +WHERE schemaname = 'public' + AND n_distinct > 100 -- Cardinality suggests index needed +ORDER BY tablename, attname; +``` + +### 4. Performance Budget + +```typescript +// Set performance budgets +const PERFORMANCE_BUDGETS = { + api_latency_p95: 500, // ms + database_queries_per_request: 5, + cache_hit_rate_min: 0.70, // 70% +}; + +// CI/CD check +if (metrics.api_latency_p95 > PERFORMANCE_BUDGETS.api_latency_p95) { + throw new Error(`Performance budget exceeded: ${metrics.api_latency_p95}ms > 500ms`); +} +``` + +--- + +## Lessons Learned + +### What Went Well + +✅ Slow query log pinpointed N+1 problem +✅ Eager loading eliminated 99% of queries +✅ Indexes provided 270x speedup +✅ Caching reduced load by 85% + +### What Could Be Improved + +❌ No N+1 query detection before production +❌ Missing indexes not caught in code review +❌ No caching layer initially +❌ No query performance monitoring + +### Key Takeaways + +1. **Always use eager loading** for associations +2. **Add indexes** for all foreign keys and WHERE clauses +3. **Implement caching** for frequently accessed data +4. **Monitor query counts** per request (alert on >10) +5. 
**Test for N+1** in CI/CD pipeline + +--- + +## Related Documentation + +- **Worker Deployment**: [cloudflare-worker-deployment-failure.md](cloudflare-worker-deployment-failure.md) +- **Database Issues**: [planetscale-connection-issues.md](planetscale-connection-issues.md) +- **Network Debugging**: [distributed-system-debugging.md](distributed-system-debugging.md) +- **Runbooks**: [../reference/troubleshooting-runbooks.md](../reference/troubleshooting-runbooks.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/devops-troubleshooting/examples/planetscale-connection-issues.md b/skills/devops-troubleshooting/examples/planetscale-connection-issues.md new file mode 100644 index 0000000..3d477c6 --- /dev/null +++ b/skills/devops-troubleshooting/examples/planetscale-connection-issues.md @@ -0,0 +1,499 @@ +# PlanetScale Connection Pool Exhaustion + +Complete investigation of database connection pool exhaustion causing 503 errors, resolved through connection pool tuning and leak fixes. + +## Overview + +**Incident**: Database connection timeouts causing 15% request failure rate +**Impact**: Customer-facing 503 errors, support tickets increasing +**Root Cause**: Connection pool too small + unclosed connections in error paths +**Resolution**: Pool tuning (20→50) + connection leak fixes +**Status**: Resolved + +## Incident Timeline + +| Time | Event | Action | +|------|-------|--------| +| 09:30 | Alerts: High 503 error rate | Oncall paged | +| 09:35 | Investigation started | Check logs, metrics | +| 09:45 | Database connections at 100% | Identified pool exhaustion | +| 10:00 | Temporary fix: restart service | Bought time for root cause | +| 10:30 | Code analysis complete | Found connection leaks | +| 11:00 | Fix deployed (pool + leaks) | Production deployment | +| 11:30 | Monitoring confirmed stable | Incident resolved | + +--- + +## Symptoms and Detection + +### Initial Alerts + +**Prometheus Alert**: +```yaml +# Alert: HighErrorRate +expr: rate(http_requests_total{status="503"}[5m]) > 0.05 +for: 5m +annotations: + summary: "503 error rate >5% for 5 minutes" + description: "Current rate: {{ $value | humanizePercentage }}" +``` + +**Error Logs**: +``` +[ERROR] Database query failed: connection timeout +[ERROR] Pool exhausted, waiting for available connection +[ERROR] Request timeout after 30s waiting for DB connection +``` + +**Impact Metrics**: +``` +Error rate: 15% (150 failures per 1000 requests) +User complaints: 23 support tickets in 30 minutes +Failed transactions: ~$15,000 in abandoned carts +``` + +--- + +## Diagnosis + +### Step 1: Check Connection Pool Status + +**Query PlanetScale**: +```bash +# Connect to database +pscale shell greyhaven-db main + +# Check active connections +SELECT + COUNT(*) as active_connections, + MAX(pg_stat_activity.query_start) as oldest_query +FROM pg_stat_activity +WHERE state = 'active'; + +# Result: +# active_connections: 98 +# oldest_query: 2024-12-05 09:15:23 (15 minutes ago!) +``` + +**Check Application Pool**: +```python +# In FastAPI app - add diagnostic endpoint +from sqlmodel import Session +from database import engine + +@app.get("/pool-status") +def pool_status(): + pool = engine.pool + return { + "size": pool.size(), + "checked_out": pool.checkedout(), + "overflow": pool.overflow(), + "timeout": pool._timeout, + "max_overflow": pool._max_overflow + } + +# Response: +{ + "size": 20, + "checked_out": 20, # Pool exhausted! 
+ "overflow": 0, + "timeout": 30, + "max_overflow": 10 +} +``` + +**Red Flags**: +- ✅ Pool at 100% capacity (20/20 connections checked out) +- ✅ No overflow connections being used (0/10) +- ✅ Connections held for >15 minutes +- ✅ New requests timing out waiting for connections + +--- + +### Step 2: Identify Connection Leaks + +**Code Review - Found Vulnerable Pattern**: +```python +# api/orders.py (BEFORE - LEAK) +from fastapi import APIRouter +from sqlmodel import Session, select +from database import engine + +router = APIRouter() + +@router.post("/orders") +async def create_order(order_data: OrderCreate): + # ❌ LEAK: Session never closed on exception + session = Session(engine) + + # Create order + order = Order(**order_data.dict()) + session.add(order) + session.commit() + + # If exception here, session never closed! + if order.total > 10000: + raise ValueError("Order exceeds limit") + + # session.close() never reached + return order +``` + +**How Leak Occurs**: +1. Request creates session (acquires connection from pool) +2. Exception raised after commit +3. Function exits without calling `session.close()` +4. Connection remains "checked out" from pool +5. After 20 such exceptions, pool exhausted + +--- + +### Step 3: Load Testing to Reproduce + +**Test Script**: +```python +# test_connection_leak.py +import asyncio +import httpx + +async def create_order(client, amount): + """Create order that will trigger exception""" + try: + response = await client.post( + "https://api.greyhaven.io/orders", + json={"total": amount} + ) + return response.status_code + except Exception: + return 503 + +async def load_test(): + """Simulate 100 orders with high amounts (triggers leak)""" + async with httpx.AsyncClient() as client: + # Trigger 100 exceptions (leak 100 connections) + tasks = [create_order(client, 15000) for _ in range(100)] + results = await asyncio.gather(*tasks) + + success = sum(1 for r in results if r == 201) + errors = sum(1 for r in results if r == 503) + + print(f"Success: {success}, Errors: {errors}") + +asyncio.run(load_test()) +``` + +**Results**: +``` +Success: 20 (first 20 use all connections) +Errors: 80 (remaining 80 timeout waiting for pool) + +Proves: Connection leak exhausts pool +``` + +--- + +## Resolution + +### Fix 1: Use Context Manager (Guaranteed Cleanup) + +**After - With Context Manager**: +```python +# api/orders.py (AFTER - FIXED) +from fastapi import APIRouter, Depends +from sqlmodel import Session +from database import get_session + +router = APIRouter() + +# ✅ Dependency injection with automatic cleanup +def get_session(): + with Session(engine) as session: + yield session + # Session always closed (even on exception) + +@router.post("/orders") +async def create_order( + order_data: OrderCreate, + session: Session = Depends(get_session) +): + # Session managed by FastAPI dependency + order = Order(**order_data.dict()) + session.add(order) + session.commit() + + # Exception here? 
No problem - session still closed by context manager + if order.total > 10000: + raise ValueError("Order exceeds limit") + + return order +``` + +**Why This Works**: +- Context manager (`with` statement) guarantees `session.close()` in `__exit__` +- Works even if exception raised +- FastAPI `Depends()` handles async cleanup automatically + +--- + +### Fix 2: Increase Connection Pool Size + +**Before** (pool too small): +```python +# database.py (BEFORE) +from sqlmodel import create_engine + +engine = create_engine( + database_url, + pool_size=20, # Too small for load + max_overflow=10, + pool_timeout=30 +) +``` + +**After** (tuned for load): +```python +# database.py (AFTER) +from sqlmodel import create_engine +import os + +# Calculate pool size based on workers +# Formula: (workers * 2) + buffer +# 16 workers * 2 + 20 buffer = 52 +workers = int(os.getenv("WEB_CONCURRENCY", 16)) +pool_size = (workers * 2) + 20 + +engine = create_engine( + database_url, + pool_size=pool_size, # 52 connections + max_overflow=20, # Burst to 72 total + pool_timeout=30, + pool_recycle=3600, # Recycle after 1 hour + pool_pre_ping=True, # Verify connection health + echo=False +) +``` + +**Pool Size Calculation**: +``` +Workers: 16 (Uvicorn workers) +Connections per worker: 2 (normal peak) +Buffer: 20 (for spikes) + +pool_size = (16 * 2) + 20 = 52 +max_overflow = 20 (total 72 for extreme spikes) +``` + +--- + +### Fix 3: Add Connection Pool Monitoring + +**Prometheus Metrics**: +```python +# monitoring.py +from prometheus_client import Gauge +from database import engine + +# Pool metrics +db_pool_size = Gauge('db_pool_size_total', 'Total pool size') +db_pool_checked_out = Gauge('db_pool_checked_out', 'Connections in use') +db_pool_idle = Gauge('db_pool_idle', 'Idle connections') +db_pool_overflow = Gauge('db_pool_overflow', 'Overflow connections') + +def update_pool_metrics(): + """Update pool metrics every 10 seconds""" + pool = engine.pool + db_pool_size.set(pool.size()) + db_pool_checked_out.set(pool.checkedout()) + db_pool_idle.set(pool.size() - pool.checkedout()) + db_pool_overflow.set(pool.overflow()) + +# Schedule in background task +import asyncio +async def pool_monitor(): + while True: + update_pool_metrics() + await asyncio.sleep(10) +``` + +**Grafana Alert**: +```yaml +# Alert: Connection pool near exhaustion +expr: db_pool_checked_out / db_pool_size_total > 0.8 +for: 5m +annotations: + summary: "Connection pool >80% utilized" + description: "{{ $value | humanizePercentage }} of pool in use" +``` + +--- + +### Fix 4: Add Timeout and Retry Logic + +**Connection Timeout Handling**: +```python +# database.py - Add connection retry +from tenacity import retry, stop_after_attempt, wait_exponential + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10) +) +def get_session_with_retry(): + """Get session with automatic retry on pool timeout""" + try: + with Session(engine) as session: + yield session + except TimeoutError: + # Pool exhausted - retry after exponential backoff + raise + +@router.post("/orders") +async def create_order( + order_data: OrderCreate, + session: Session = Depends(get_session_with_retry) +): + # Will retry up to 3 times if pool exhausted + ... 
+``` + +--- + +## Results + +### Before vs After Metrics + +| Metric | Before Fix | After Fix | Improvement | +|--------|-----------|-----------|-------------| +| **Connection Pool Size** | 20 | 52 | +160% capacity | +| **Pool Utilization** | 100% (exhausted) | 40-60% (healthy) | -40% utilization | +| **503 Error Rate** | 15% | 0.01% | **99.9% reduction** | +| **Request Timeout** | 30s (waiting) | <100ms | **99.7% faster** | +| **Leaked Connections** | 12/hour | 0/day | **100% eliminated** | + +--- + +### Deployment Verification + +**Load Test After Fix**: +```bash +# Simulate 1000 concurrent orders +ab -n 1000 -c 50 -p order.json https://api.greyhaven.io/orders + +# Results: +Requests per second: 250 [#/sec] +Time per request: 200ms [mean] +Failed requests: 0 (0%) +Successful requests: 1000 (100%) + +# Pool status during test: +{ + "size": 52, + "checked_out": 28, # 54% utilization (healthy) + "overflow": 0, + "idle": 24 +} +``` + +--- + +## Prevention Measures + +### 1. Connection Leak Tests + +```python +# tests/test_connection_leaks.py +@pytest.fixture +def track_connections(): + before = engine.pool.checkedout() + yield + after = engine.pool.checkedout() + assert after == before, f"Leaked {after - before} connections" +``` + +### 2. Pool Alerts + +```yaml +# Alert if pool >80% for 5 minutes +expr: db_pool_checked_out / db_pool_size_total > 0.8 +``` + +### 3. Health Check + +```python +@app.get("/health/database") +async def database_health(): + with Session(engine) as session: + session.execute("SELECT 1") + return {"status": "healthy", "pool_utilization": pool.checkedout() / pool.size()} +``` + +### 4. Monitoring Commands + +```bash +# Active connections +pscale shell db main --execute "SELECT COUNT(*) FROM pg_stat_activity WHERE state='active'" + +# Slow queries +pscale database insights db main --slow-queries +``` + +--- + +## Lessons Learned + +### What Went Well + +✅ Quick identification of pool exhaustion (Prometheus alerts) +✅ Context manager pattern eliminated leaks +✅ Pool tuning based on formula (workers * 2 + buffer) +✅ Comprehensive monitoring added + +### What Could Be Improved + +❌ No pool monitoring before incident +❌ Pool size not calculated based on load +❌ Missing connection leak tests + +### Key Takeaways + +1. **Always use context managers** for database sessions +2. **Calculate pool size** based on workers and load +3. **Monitor pool utilization** with alerts at 80% +4. **Test for connection leaks** in CI/CD +5. 
**Add retry logic** for transient pool timeouts + +--- + +## PlanetScale Best Practices + +```bash +# Connection string with SSL +DATABASE_URL="postgresql://user:pass@aws.connect.psdb.cloud/db?sslmode=require" + +# Schema changes via deploy requests +pscale deploy-request create db schema-update + +# Test in branch +pscale branch create db test-feature +``` + +```sql +-- Index frequently queried columns +CREATE INDEX idx_orders_user_id ON orders(user_id); + +-- Analyze slow queries +EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123; +``` + +--- + +## Related Documentation + +- **Worker Deployment**: [cloudflare-worker-deployment-failure.md](cloudflare-worker-deployment-failure.md) +- **Network Debugging**: [distributed-system-debugging.md](distributed-system-debugging.md) +- **Performance**: [performance-degradation-analysis.md](performance-degradation-analysis.md) +- **Runbooks**: [../reference/troubleshooting-runbooks.md](../reference/troubleshooting-runbooks.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/devops-troubleshooting/reference/INDEX.md b/skills/devops-troubleshooting/reference/INDEX.md new file mode 100644 index 0000000..8e2c2b7 --- /dev/null +++ b/skills/devops-troubleshooting/reference/INDEX.md @@ -0,0 +1,72 @@ +# DevOps Troubleshooter Reference + +Quick reference guides for Grey Haven infrastructure troubleshooting - runbooks, diagnostic commands, and platform-specific guides. + +## Reference Guides + +### Troubleshooting Runbooks + +**File**: [troubleshooting-runbooks.md](troubleshooting-runbooks.md) + +Step-by-step runbooks for common infrastructure issues: +- **Worker Not Responding**: 500/502/503 errors from Cloudflare Workers +- **Database Connection Failures**: Connection refused, pool exhaustion +- **Deployment Failures**: Failed deployments, rollback procedures +- **Performance Degradation**: Slow responses, high latency +- **Network Issues**: DNS failures, connectivity problems + +**Use when**: Following structured resolution for known issues + +--- + +### Diagnostic Commands Reference + +**File**: [diagnostic-commands.md](diagnostic-commands.md) + +Command reference for quick troubleshooting: +- **Cloudflare Workers**: wrangler commands, log analysis +- **PlanetScale**: Database queries, connection checks +- **Network**: curl timing, DNS resolution, traceroute +- **Performance**: Profiling, metrics collection + +**Use when**: Need quick command syntax for diagnostics + +--- + +### Cloudflare Workers Platform Guide + +**File**: [cloudflare-workers-guide.md](cloudflare-workers-guide.md) + +Cloudflare Workers-specific guidance: +- **Deployment Best Practices**: Bundle size, environment variables +- **Performance Optimization**: CPU limits, memory management +- **Error Handling**: Common errors and solutions +- **Monitoring**: Logs, metrics, analytics + +**Use when**: Cloudflare Workers-specific issues + +--- + +## Quick Navigation + +**By Issue Type**: +- Worker errors → [troubleshooting-runbooks.md#worker-not-responding](troubleshooting-runbooks.md#worker-not-responding) +- Database issues → [troubleshooting-runbooks.md#database-connection-failures](troubleshooting-runbooks.md#database-connection-failures) +- Performance → [troubleshooting-runbooks.md#performance-degradation](troubleshooting-runbooks.md#performance-degradation) + +**By Platform**: +- Cloudflare Workers → [cloudflare-workers-guide.md](cloudflare-workers-guide.md) +- PlanetScale → [diagnostic-commands.md#planetscale-commands](diagnostic-commands.md#planetscale-commands) 
+- Network → [diagnostic-commands.md#network-commands](diagnostic-commands.md#network-commands) + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Full troubleshooting walkthroughs +- **Templates**: [Templates Index](../templates/INDEX.md) - Incident report templates +- **Main Agent**: [devops-troubleshooter.md](../devops-troubleshooter.md) - DevOps troubleshooter agent + +--- + +Return to [main agent](../devops-troubleshooter.md) diff --git a/skills/devops-troubleshooting/reference/cloudflare-workers-guide.md b/skills/devops-troubleshooting/reference/cloudflare-workers-guide.md new file mode 100644 index 0000000..c569bab --- /dev/null +++ b/skills/devops-troubleshooting/reference/cloudflare-workers-guide.md @@ -0,0 +1,472 @@ +# Cloudflare Workers Platform Guide + +Comprehensive guide for deploying, monitoring, and troubleshooting Cloudflare Workers in Grey Haven's stack. + +## Workers Architecture + +**Execution Model**: +- V8 isolates (not containers) +- Deployed globally to 300+ datacenters +- Request routed to nearest location +- Cold start: ~1-5ms (vs 100-1000ms for containers) +- CPU time limit: 50ms (Free), 50ms-30s (Paid) + +**Resource Limits**: +``` +Free Plan: +- Bundle size: 1MB compressed +- CPU time: 50ms per request +- Requests: 100,000/day +- KV reads: 100,000/day + +Paid Plan ($5/month): +- Bundle size: 10MB compressed +- CPU time: 50ms (standard), up to 30s (unbound) +- Requests: 10M included, $0.50/million after +- KV reads: 10M included +``` + +--- + +## Deployment Best Practices + +### Bundle Optimization + +**Size Reduction Strategies**: +```typescript +// 1. Tree shaking with named imports +import { uniq } from 'lodash-es'; // ✅ Only imports uniq +import _ from 'lodash'; // ❌ Imports entire library + +// 2. Use native APIs instead of libraries +const date = new Date().toISOString(); // ✅ Native +import moment from 'moment'; // ❌ 300KB library + +// 3. External API calls instead of SDKs +await fetch('https://api.anthropic.com/v1/messages', { + method: 'POST', + headers: { 'x-api-key': env.API_KEY }, + body: JSON.stringify({ ... }) +}); // ✅ 0KB vs @anthropic-ai/sdk (2.1MB) + +// 4. 
Code splitting with dynamic imports +if (request.url.includes('/special')) { + const { handler } = await import('./expensive-module'); + return handler(request); +} // ✅ Lazy load +``` + +**webpack Configuration**: +```javascript +module.exports = { + mode: 'production', + target: 'webworker', + optimization: { + minimize: true, + usedExports: true, // Tree shaking + sideEffects: false + }, + resolve: { + alias: { + 'lodash': 'lodash-es' // Use ES modules version + } + } +}; +``` + +--- + +### Environment Variables + +**Using Secrets**: +```bash +# Add secret (never in code) +wrangler secret put DATABASE_URL + +# List secrets +wrangler secret list + +# Delete secret +wrangler secret delete OLD_KEY +``` + +**Using Variables** (wrangler.toml): +```toml +[vars] +API_ENDPOINT = "https://api.partner.com" +MAX_RETRIES = "3" +CACHE_TTL = "300" + +[env.staging.vars] +API_ENDPOINT = "https://staging-api.partner.com" + +[env.production.vars] +API_ENDPOINT = "https://api.partner.com" +``` + +**Accessing in Code**: +```typescript +export default { + async fetch(request: Request, env: Env) { + const dbUrl = env.DATABASE_URL; // Secret + const endpoint = env.API_ENDPOINT; // Var + const maxRetries = parseInt(env.MAX_RETRIES); + + return new Response('OK'); + } +}; +``` + +--- + +## Performance Optimization + +### CPU Time Management + +**Avoid CPU-Intensive Operations**: +```typescript +// ❌ BAD: CPU-intensive operation +function processLargeDataset(data) { + const sorted = data.sort((a, b) => a.value - b.value); + const filtered = sorted.filter(item => item.value > 1000); + const mapped = filtered.map(item => ({ ...item, processed: true })); + return mapped; // Can exceed 50ms CPU limit +} + +// ✅ GOOD: Offload to external service +async function processLargeDataset(data, env) { + const response = await fetch(`${env.PROCESSING_API}/process`, { + method: 'POST', + body: JSON.stringify(data) + }); + return response.json(); // External service handles heavy lifting +} + +// ✅ BETTER: Use Durable Objects for stateful computation +const id = env.PROCESSOR.idFromName('processor'); +const stub = env.PROCESSOR.get(id); +return stub.fetch(request); // Durable Object has more CPU time +``` + +**Monitor CPU Usage**: +```typescript +export default { + async fetch(request: Request, env: Env) { + const start = Date.now(); + + try { + const response = await handleRequest(request, env); + const duration = Date.now() - start; + + if (duration > 40) { + console.warn(`CPU time approaching limit: ${duration}ms`); + } + + return response; + } catch (error) { + const duration = Date.now() - start; + console.error(`Request failed after ${duration}ms:`, error); + throw error; + } + } +}; +``` + +--- + +### Caching Strategies + +**Cache API**: +```typescript +export default { + async fetch(request: Request) { + const cache = caches.default; + + // Check cache + let response = await cache.match(request); + if (response) return response; + + // Cache miss - fetch and cache + response = await fetch(request); + + // Cache for 5 minutes + const cacheResponse = new Response(response.body, response); + cacheResponse.headers.set('Cache-Control', 'max-age=300'); + await cache.put(request, cacheResponse.clone()); + + return response; + } +}; +``` + +**KV for Data Caching**: +```typescript +export default { + async fetch(request: Request, env: Env) { + const url = new URL(request.url); + const cacheKey = `data:${url.pathname}`; + + // Check KV + const cached = await env.CACHE.get(cacheKey, 'json'); + if (cached) return 
Response.json(cached); + + // Fetch data + const data = await fetchExpensiveData(); + + // Store in KV with 5min TTL + await env.CACHE.put(cacheKey, JSON.stringify(data), { + expirationTtl: 300 + }); + + return Response.json(data); + } +}; +``` + +--- + +## Common Errors and Solutions + +### Error 1101: Worker Threw Exception + +**Cause**: Unhandled JavaScript exception + +**Example**: +```typescript +// ❌ BAD: Unhandled error +export default { + async fetch(request: Request) { + const data = JSON.parse(request.body); // Throws if invalid JSON + return Response.json(data); + } +}; +``` + +**Solution**: +```typescript +// ✅ GOOD: Proper error handling +export default { + async fetch(request: Request) { + try { + const body = await request.text(); + const data = JSON.parse(body); + return Response.json(data); + } catch (error) { + console.error('JSON parse error:', error); + return new Response('Invalid JSON', { status: 400 }); + } + } +}; +``` + +--- + +### Error 1015: Rate Limited + +**Cause**: Too many requests to origin + +**Solution**: Implement caching and rate limiting +```typescript +const RATE_LIMIT = 100; // requests per minute +const rateLimits = new Map(); + +export default { + async fetch(request: Request) { + const ip = request.headers.get('CF-Connecting-IP'); + const key = `ratelimit:${ip}`; + + const count = rateLimits.get(key) || 0; + if (count >= RATE_LIMIT) { + return new Response('Rate limit exceeded', { status: 429 }); + } + + rateLimits.set(key, count + 1); + setTimeout(() => rateLimits.delete(key), 60000); + + return new Response('OK'); + } +}; +``` + +--- + +### Error: Script Exceeds Size Limit + +**Diagnosis**: +```bash +# Check bundle size +npm run build +ls -lh dist/worker.js + +# Analyze bundle +npm install --save-dev webpack-bundle-analyzer +npm run build -- --analyze +``` + +**Solutions**: See [bundle optimization](#bundle-optimization) above + +--- + +## Monitoring and Logging + +### Structured Logging + +```typescript +interface LogEntry { + level: 'info' | 'warn' | 'error'; + message: string; + timestamp: string; + requestId?: string; + duration?: number; + metadata?: Record; +} + +function log(entry: LogEntry) { + console.log(JSON.stringify({ + ...entry, + timestamp: new Date().toISOString() + })); +} + +export default { + async fetch(request: Request, env: Env) { + const requestId = crypto.randomUUID(); + const start = Date.now(); + + try { + log({ + level: 'info', + message: 'Request started', + requestId, + metadata: { + method: request.method, + url: request.url + } + }); + + const response = await handleRequest(request, env); + + log({ + level: 'info', + message: 'Request completed', + requestId, + duration: Date.now() - start, + metadata: { + status: response.status + } + }); + + return response; + } catch (error) { + log({ + level: 'error', + message: 'Request failed', + requestId, + duration: Date.now() - start, + metadata: { + error: error.message, + stack: error.stack + } + }); + + return new Response('Internal Server Error', { status: 500 }); + } + } +}; +``` + +--- + +### Health Check Endpoint + +```typescript +export default { + async fetch(request: Request, env: Env) { + const url = new URL(request.url); + + if (url.pathname === '/health') { + return Response.json({ + status: 'healthy', + timestamp: new Date().toISOString(), + version: env.VERSION || 'unknown' + }); + } + + // Regular request handling + return handleRequest(request, env); + } +}; +``` + +--- + +## Testing Workers + +```bash +# Local testing +wrangler dev +curl 
http://localhost:8787/api/users +curl -X POST http://localhost:8787/api/users -H "Content-Type: application/json" -d '{"name": "Test User"}' + +# Unit testing (Vitest) +import { describe, it, expect } from 'vitest'; +import worker from './worker'; + +describe('Worker', () => { + it('returns 200 for health check', async () => { + const request = new Request('https://example.com/health'); + const response = await worker.fetch(request, getMockEnv()); + expect(response.status).toBe(200); + }); +}); +``` + +--- + +## Security Best Practices + +```typescript +// 1. Validate inputs +function validateEmail(email: string): boolean { + return /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email); +} + +// 2. Set security headers +function addSecurityHeaders(response: Response): Response { + response.headers.set('X-Content-Type-Options', 'nosniff'); + response.headers.set('X-Frame-Options', 'DENY'); + response.headers.set('Strict-Transport-Security', 'max-age=31536000'); + return response; +} + +// 3. CORS configuration +const ALLOWED_ORIGINS = ['https://app.greyhaven.io', 'https://staging.greyhaven.io']; +function handleCors(request: Request): Response | null { + const origin = request.headers.get('Origin'); + if (request.method === 'OPTIONS') { + return new Response(null, { + headers: { + 'Access-Control-Allow-Origin': origin, + 'Access-Control-Allow-Methods': 'GET,POST,PUT,DELETE', + 'Access-Control-Max-Age': '86400' + } + }); + } + if (origin && !ALLOWED_ORIGINS.includes(origin)) { + return new Response('Forbidden', { status: 403 }); + } + return null; +} +``` + +--- + +## Related Documentation + +- **Runbooks**: [troubleshooting-runbooks.md](troubleshooting-runbooks.md) - Step-by-step procedures +- **Commands**: [diagnostic-commands.md](diagnostic-commands.md) - Command reference +- **Examples**: [Examples Index](../examples/INDEX.md) - Full examples + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/devops-troubleshooting/reference/diagnostic-commands.md b/skills/devops-troubleshooting/reference/diagnostic-commands.md new file mode 100644 index 0000000..596828a --- /dev/null +++ b/skills/devops-troubleshooting/reference/diagnostic-commands.md @@ -0,0 +1,473 @@ +# Diagnostic Commands Reference + +Quick command reference for Grey Haven infrastructure troubleshooting. Copy-paste ready commands for rapid diagnosis. 
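+
+Most commands below target the same database and endpoint. An optional shell preamble (a sketch reusing the names from this guide; adjust to your environment) keeps the copy-pasted commands consistent:
+
+```bash
+# Set once per shell session, then substitute into the commands below
+export PSCALE_DB="greyhaven-db"              # PlanetScale database name
+export PSCALE_BRANCH="main"                  # branch to query
+export API_HOST="https://api.greyhaven.io"   # Worker endpoint under test
+
+# Quick smoke test using the variables
+pscale shell "$PSCALE_DB" "$PSCALE_BRANCH" --execute "SELECT 1"
+curl -s -o /dev/null -w "%{http_code}\n" "$API_HOST/health"
+```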
+ +## Cloudflare Workers Commands + +### Deployment Management + +```bash +# List recent deployments +wrangler deployments list + +# View specific deployment +wrangler deployments view + +# Rollback to previous version +wrangler rollback --message "Reverting due to errors" + +# Deploy to production +wrangler deploy + +# Deploy to staging +wrangler deploy --env staging +``` + +### Logs and Monitoring + +```bash +# Real-time logs (pretty format) +wrangler tail --format pretty + +# JSON logs for parsing +wrangler tail --format json + +# Filter by status code +wrangler tail --format json | grep "\"status\":500" + +# Show only errors +wrangler tail --format json | grep -i "error" + +# Save logs to file +wrangler tail --format json > worker-logs.json + +# Monitor specific worker +wrangler tail --name my-worker +``` + +### Local Development + +```bash +# Start local dev server +wrangler dev + +# Dev with specific port +wrangler dev --port 8788 + +# Dev with remote mode (use production bindings) +wrangler dev --remote + +# Test locally +curl http://localhost:8787/api/health +``` + +### Configuration + +```bash +# Show account info +wrangler whoami + +# List KV namespaces +wrangler kv:namespace list + +# List secrets +wrangler secret list + +# Add secret +wrangler secret put API_KEY + +# Delete secret +wrangler secret delete API_KEY +``` + +--- + +## PlanetScale Commands + +### Database Management + +```bash +# Connect to database shell +pscale shell greyhaven-db main + +# Connect and execute query +pscale shell greyhaven-db main --execute "SELECT COUNT(*) FROM users" + +# Show database info +pscale database show greyhaven-db + +# List all databases +pscale database list + +# Create new branch +pscale branch create greyhaven-db feature-branch + +# List branches +pscale branch list greyhaven-db +``` + +### Connection Monitoring + +```sql +-- Active connections +SELECT COUNT(*) as active_connections +FROM pg_stat_activity +WHERE state = 'active'; + +-- Long-running queries +SELECT + pid, + now() - query_start as duration, + query +FROM pg_stat_activity +WHERE state = 'active' + AND query_start < now() - interval '10 seconds' +ORDER BY duration DESC; + +-- Connection by state +SELECT state, COUNT(*) +FROM pg_stat_activity +GROUP BY state; + +-- Blocked queries +SELECT + blocked.pid AS blocked_pid, + blocking.pid AS blocking_pid, + blocked.query AS blocked_query +FROM pg_stat_activity blocked +JOIN pg_stat_activity blocking + ON blocking.pid = ANY(pg_blocking_pids(blocked.pid)); +``` + +### Performance Analysis + +```bash +# Slow query insights +pscale database insights greyhaven-db main --slow-queries + +# Database size +pscale database show greyhaven-db --web + +# Enable slow query log +pscale database settings update greyhaven-db --enable-slow-query-log +``` + +```sql +-- Table sizes +SELECT + schemaname, + tablename, + pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size +FROM pg_tables +WHERE schemaname = 'public' +ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC; + +-- Index usage +SELECT + schemaname, + tablename, + indexname, + idx_scan as index_scans +FROM pg_stat_user_indexes +ORDER BY idx_scan ASC; + +-- Cache hit ratio +SELECT + 'cache hit rate' AS metric, + sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) AS ratio +FROM pg_statio_user_tables; +``` + +### Schema Migrations + +```bash +# Create deploy request +pscale deploy-request create greyhaven-db + +# List deploy requests +pscale deploy-request list greyhaven-db + +# View deploy 
request diff +pscale deploy-request diff greyhaven-db + +# Deploy schema changes +pscale deploy-request deploy greyhaven-db + +# Close deploy request +pscale deploy-request close greyhaven-db +``` + +--- + +## Network Diagnostic Commands + +### DNS Resolution + +```bash +# Basic DNS lookup +nslookup api.partner.com + +# Detailed DNS query +dig api.partner.com + +# Measure DNS time +time nslookup api.partner.com + +# Check DNS propagation +dig api.partner.com @8.8.8.8 +dig api.partner.com @1.1.1.1 + +# Reverse DNS lookup +dig -x 203.0.113.42 +``` + +### Connectivity Testing + +```bash +# Ping test +ping -c 10 api.partner.com + +# Trace network route +traceroute api.partner.com + +# TCP connection test +nc -zv api.partner.com 443 + +# Test specific port +telnet api.partner.com 443 +``` + +### HTTP Request Timing + +```bash +# Full timing breakdown +curl -w "\nDNS Lookup: %{time_namelookup}s\nTCP Connect: %{time_connect}s\nTLS Handshake: %{time_appconnect}s\nStart Transfer:%{time_starttransfer}s\nTotal: %{time_total}s\n" \ + -o /dev/null -s https://api.partner.com/data + +# Test with specific method +curl -X POST https://api.example.com/api \ + -H "Content-Type: application/json" \ + -d '{"test": "data"}' + +# Follow redirects +curl -L https://example.com + +# Show response headers +curl -I https://api.example.com + +# Test CORS +curl -I -X OPTIONS https://api.example.com \ + -H "Origin: https://app.example.com" \ + -H "Access-Control-Request-Method: POST" +``` + +### SSL/TLS Verification + +```bash +# Check SSL certificate +openssl s_client -connect api.example.com:443 + +# Show certificate expiry +echo | openssl s_client -connect api.example.com:443 2>/dev/null | \ + openssl x509 -noout -dates + +# Verify certificate chain +openssl s_client -connect api.example.com:443 -showcerts +``` + +--- + +## Application Performance Commands + +### Resource Monitoring + +```bash +# CPU usage +top -o cpu + +# Memory usage +free -h # Linux +vm_stat # macOS + +# Disk usage +df -h + +# Process list +ps aux | grep node + +# Port usage +lsof -i :8000 +netstat -an | grep 8000 +``` + +### Log Analysis + +```bash +# Tail logs +tail -f /var/log/app.log + +# Search logs +grep -i "error" /var/log/app.log + +# Count errors +grep -c "ERROR" /var/log/app.log + +# Show recent errors with context +grep -B 5 -A 5 "error" /var/log/app.log + +# Parse JSON logs +cat app.log | jq 'select(.level=="error")' + +# Error frequency +grep "ERROR" /var/log/app.log | cut -d' ' -f1 | uniq -c +``` + +### Worker Performance + +```bash +# Monitor CPU time +wrangler tail --format json | jq '.outcome.cpuTime' + +# Monitor duration +wrangler tail --format json | jq '.outcome.duration' + +# Requests per second +wrangler tail --format json | wc -l + +# Average response time +wrangler tail --format json | \ + jq -r '.outcome.duration' | \ + awk '{sum+=$1; count++} END {print sum/count}' +``` + +--- + +## Health Check Scripts + +### Worker Health Check + +```bash +#!/bin/bash +# health-check-worker.sh + +echo "=== Worker Health Check ===" + +# Test endpoint +STATUS=$(curl -s -o /dev/null -w "%{http_code}" https://api.greyhaven.io/health) + +if [ "$STATUS" -eq 200 ]; then + echo "✅ Worker responding (HTTP $STATUS)" +else + echo "❌ Worker error (HTTP $STATUS)" + exit 1 +fi + +# Check response time +TIME=$(curl -w "%{time_total}" -o /dev/null -s https://api.greyhaven.io/health) +echo "Response time: ${TIME}s" + +if (( $(echo "$TIME > 1.0" | bc -l) )); then + echo "⚠️ Slow response (>${TIME}s)" +fi +``` + +### Database Health Check + +```bash 
+#!/bin/bash +# health-check-db.sh + +echo "=== Database Health Check ===" + +# Test connection +pscale shell greyhaven-db main --execute "SELECT 1" > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + echo "✅ Database connection OK" +else + echo "❌ Database connection failed" + exit 1 +fi + +# Check active connections +ACTIVE=$(pscale shell greyhaven-db main --execute \ + "SELECT COUNT(*) FROM pg_stat_activity WHERE state='active'" | tail -1) + +echo "Active connections: $ACTIVE" + +if [ "$ACTIVE" -gt 80 ]; then + echo "⚠️ High connection count (>80)" +fi +``` + +### Complete System Health + +```bash +#!/bin/bash +# health-check-all.sh + +echo "=== Complete System Health Check ===" + +# Worker +echo "\n1. Cloudflare Worker" +./health-check-worker.sh + +# Database +echo "\n2. PlanetScale Database" +./health-check-db.sh + +# External APIs +echo "\n3. External Dependencies" +for API in "https://api.partner1.com/health" "https://api.partner2.com/health"; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$API") + if [ "$STATUS" -eq 200 ]; then + echo "✅ $API (HTTP $STATUS)" + else + echo "❌ $API (HTTP $STATUS)" + fi +done + +echo "\n=== Health Check Complete ===" +``` + +--- + +## Troubleshooting One-Liners + +```bash +# Find memory hogs +ps aux --sort=-%mem | head -10 + +# Find CPU hogs +ps aux --sort=-%cpu | head -10 + +# Disk space by directory +du -sh /* | sort -h + +# Network connections +netstat -ant | awk '{print $6}' | sort | uniq -c + +# Failed login attempts +grep "Failed password" /var/log/auth.log | wc -l + +# Top error codes +awk '{print $9}' access.log | sort | uniq -c | sort -rn + +# Requests per minute +awk '{print $4}' access.log | cut -d: -f1-2 | uniq -c + +# Average response size +awk '{sum+=$10; count++} END {print sum/count}' access.log +``` + +--- + +## Related Documentation + +- **Runbooks**: [troubleshooting-runbooks.md](troubleshooting-runbooks.md) - Step-by-step procedures +- **Cloudflare Guide**: [cloudflare-workers-guide.md](cloudflare-workers-guide.md) - Platform-specific +- **Examples**: [Examples Index](../examples/INDEX.md) - Full troubleshooting examples + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/devops-troubleshooting/reference/troubleshooting-runbooks.md b/skills/devops-troubleshooting/reference/troubleshooting-runbooks.md new file mode 100644 index 0000000..eff4ae1 --- /dev/null +++ b/skills/devops-troubleshooting/reference/troubleshooting-runbooks.md @@ -0,0 +1,489 @@ +# Troubleshooting Runbooks + +Step-by-step runbooks for resolving common Grey Haven infrastructure issues. Follow procedures systematically for fastest resolution. + +## Runbook 1: Worker Not Responding + +### Symptoms +- API returning 500/502/503 errors +- Workers timing out or not processing requests +- Cloudflare error pages showing + +### Diagnosis Steps + +**1. Check Cloudflare Status** +```bash +# Visit: https://www.cloudflarestatus.com +# Or query status API +curl -s https://www.cloudflarestatus.com/api/v2/status.json | jq '.status.indicator' +``` + +**2. View Worker Logs** +```bash +# Real-time logs +wrangler tail --format pretty + +# Look for errors: +# - "Script exceeded CPU time limit" +# - "Worker threw exception" +# - "Uncaught TypeError" +``` + +**3. Check Recent Deployments** +```bash +wrangler deployments list + +# If recent deployment suspicious, rollback: +wrangler rollback --message "Reverting to stable version" +``` + +**4. 
Test Worker Locally** +```bash +# Run worker in dev mode +wrangler dev + +# Test endpoint +curl http://localhost:8787/api/health +``` + +### Resolution Paths + +**Path A: Platform Issue** - Wait for Cloudflare, monitor status, communicate ETA +**Path B: Code Error** - Rollback deployment, fix in dev, test before redeploy +**Path C: Resource Limit** - Check CPU logs, optimize operations, upgrade if needed +**Path D: Binding Issue** - Verify wrangler.toml, check bindings, redeploy + +### Prevention +- Health check endpoint: `GET /health` +- Monitor error rate with alerts (>1% = alert) +- Test deployments in staging first +- Implement circuit breakers for external calls + +--- + +## Runbook 2: Database Connection Failures + +### Symptoms +- "connection refused" errors +- "too many connections" errors +- Application timing out on database queries +- 503 errors from API + +### Diagnosis Steps + +**1. Test Database Connection** +```bash +# Direct connection test +pscale shell greyhaven-db main + +# If fails, check: +# - Database status +# - Credentials +# - Network connectivity +``` + +**2. Check Connection Pool** +```bash +# Query pool status +curl http://localhost:8000/pool-status + +# Expected healthy response: +{ + "size": 50, + "checked_out": 25, # <80% is healthy + "overflow": 0, + "available": 25 +} +``` + +**3. Check Active Connections** +```sql +-- In pscale shell +SELECT + COUNT(*) as active, + MAX(query_start) as oldest_query +FROM pg_stat_activity +WHERE state = 'active'; + +-- If active = pool size, pool exhausted +-- If oldest_query >10min, leaked connection +``` + +**4. Review Application Logs** +```bash +# Search for connection errors +grep -i "connection" logs/app.log | tail -50 + +# Common errors: +# - "Pool timeout" +# - "Connection refused" +# - "Max connections reached" +``` + +### Resolution Paths + +**Path A: Invalid Credentials** +```bash +# Rotate credentials +pscale password create greyhaven-db main app-password + +# Update environment variable +# Restart application +``` + +**Path B: Pool Exhausted** +```python +# Increase pool size in database.py +engine = create_engine( + database_url, + pool_size=50, # Increase from 20 + max_overflow=20 +) +``` + +**Path C: Connection Leaks** +```python +# Fix: Use context managers +with Session(engine) as session: + # Work with session + pass # Automatically closed +``` + +**Path D: Database Paused/Down** +```bash +# Resume database if paused +pscale database resume greyhaven-db + +# Check database status +pscale database show greyhaven-db +``` + +### Prevention +- Use connection pooling with proper limits +- Implement retry logic with exponential backoff +- Monitor pool utilization (alert >80%) +- Test for connection leaks in CI/CD + +--- + +## Runbook 3: Deployment Failures + +### Symptoms +- `wrangler deploy` fails +- CI/CD pipeline fails at deployment step +- New code not reflecting in production + +### Diagnosis Steps + +**1. Check Deployment Error** +```bash +wrangler deploy --verbose + +# Common errors: +# - "Script exceeds size limit" +# - "Syntax error in worker" +# - "Environment variable missing" +# - "Binding not found" +``` + +**2. Verify Build Output** +```bash +# Check built file +ls -lh dist/ +npm run build + +# Ensure build succeeds locally +``` + +**3. Check Environment Variables** +```bash +# List secrets +wrangler secret list + +# Verify wrangler.toml vars +cat wrangler.toml | grep -A 10 "\[vars\]" +``` + +**4. 
Test Locally** +```bash +# Start dev server +wrangler dev + +# If works locally but not production: +# - Environment variable mismatch +# - Binding configuration issue +``` + +### Resolution Paths + +**Path A: Bundle Too Large** +```bash +# Check bundle size +ls -lh dist/worker.js + +# Solutions: +# - Tree shake unused code +# - Code split large modules +# - Use fetch instead of SDK +``` + +**Path B: Syntax Error** +```bash +# Run TypeScript check +npm run type-check + +# Run linter +npm run lint + +# Fix errors before deploying +``` + +**Path C: Missing Variables** +```bash +# Add missing secret +wrangler secret put API_KEY + +# Or add to wrangler.toml vars +[vars] +API_ENDPOINT = "https://api.example.com" +``` + +**Path D: Binding Not Found** +```toml +# wrangler.toml - Add binding +[[kv_namespaces]] +binding = "CACHE" +id = "abc123" + +[[d1_databases]] +binding = "DB" +database_name = "greyhaven-db" +database_id = "xyz789" +``` + +### Prevention +- Bundle size check in CI/CD +- Pre-commit hooks for validation +- Staging environment for testing +- Automated deployment tests + +--- + +## Runbook 4: Performance Degradation + +### Symptoms +- API response times increased (>2x normal) +- Slow page loads +- User complaints about slowness +- Timeout errors + +### Diagnosis Steps + +**1. Check Current Latency** +```bash +# Test endpoint +curl -w "\nTotal: %{time_total}s\n" -o /dev/null -s https://api.greyhaven.io/orders + +# p95 should be <500ms +# If >1s, investigate +``` + +**2. Analyze Worker Logs** +```bash +wrangler tail --format json | jq '{duration: .outcome.duration, event: .event}' + +# Identify slow requests +# Check what's taking time +``` + +**3. Check Database Queries** +```bash +# Slow query log +pscale database insights greyhaven-db main --slow-queries + +# Look for: +# - N+1 queries (many small queries) +# - Missing indexes (full table scans) +# - Long-running queries (>100ms) +``` + +**4. Profile Application** +```bash +# Add timing middleware +# Log slow operations +# Identify bottleneck (DB, API, compute) +``` + +### Resolution Paths + +**Path A: N+1 Queries** +```python +# Use eager loading +statement = ( + select(Order) + .options(selectinload(Order.items)) +) +``` + +**Path B: Missing Indexes** +```sql +-- Add indexes +CREATE INDEX idx_orders_user_id ON orders(user_id); +CREATE INDEX idx_items_order_id ON order_items(order_id); +``` + +**Path C: No Caching** +```typescript +// Add Redis caching +const cached = await redis.get(cacheKey); +if (cached) return cached; + +const result = await expensiveOperation(); +await redis.setex(cacheKey, 300, result); +``` + +**Path D: Worker CPU Limit** +```typescript +// Optimize expensive operations +// Use async operations +// Offload to external service +``` + +### Prevention +- Monitor p95 latency (alert >500ms) +- Test for N+1 queries in CI/CD +- Add indexes for foreign keys +- Implement caching layer +- Performance budgets in tests + +--- + +## Runbook 5: Network Connectivity Issues + +### Symptoms +- Intermittent failures +- DNS resolution errors +- Connection timeouts +- CORS errors + +### Diagnosis Steps + +**1. Test DNS Resolution** +```bash +# Check DNS +nslookup api.partner.com +dig api.partner.com + +# Measure DNS time +time nslookup api.partner.com + +# If >1s, DNS is slow +``` + +**2. 
Test Connectivity** +```bash +# Basic connectivity +ping api.partner.com + +# Trace route +traceroute api.partner.com + +# Full timing breakdown +curl -w "\nDNS: %{time_namelookup}s\nConnect: %{time_connect}s\nTotal: %{time_total}s\n" \ + -o /dev/null -s https://api.partner.com +``` + +**3. Check CORS** +```bash +# Preflight request +curl -I -X OPTIONS https://api.greyhaven.io/api/users \ + -H "Origin: https://app.greyhaven.io" \ + -H "Access-Control-Request-Method: POST" + +# Verify headers: +# - Access-Control-Allow-Origin +# - Access-Control-Allow-Methods +``` + +**4. Check Firewall/Security** +```bash +# Test from different location +# Check IP whitelist +# Verify SSL certificate +``` + +### Resolution Paths + +**Path A: Slow DNS** +```typescript +// Implement DNS caching +const DNS_CACHE = new Map(); +// Cache DNS for 60s +``` + +**Path B: Connection Timeout** +```typescript +// Increase timeout +const controller = new AbortController(); +setTimeout(() => controller.abort(), 30000); // 30s +``` + +**Path C: CORS Error** +```typescript +// Add CORS headers +response.headers.set('Access-Control-Allow-Origin', origin); +response.headers.set('Access-Control-Allow-Methods', 'GET,POST,PUT,DELETE'); +``` + +**Path D: SSL/TLS Issue** +```bash +# Check certificate +openssl s_client -connect api.partner.com:443 + +# Verify not expired +# Check certificate chain +``` + +### Prevention +- DNS caching (60s TTL) +- Appropriate timeouts (30s for external APIs) +- Health checks for external dependencies +- Circuit breakers for failures +- Monitor external API latency + +--- + +## Emergency Procedures (SEV1) + +**Immediate Actions**: +1. **Assess**: Users affected? Functionality broken? Data loss risk? +2. **Communicate**: Alert team, update status page +3. **Stop Bleeding**: `wrangler rollback` or disable feature +4. **Diagnose**: Logs, recent changes, metrics +5. **Fix**: Hotfix or workaround, test first +6. **Verify**: Monitor metrics, test functionality +7. **Postmortem**: Document, root cause, prevention + +--- + +## Escalation Matrix + +| Issue Type | First Response | Escalate To | Escalation Trigger | +|------------|---------------|-------------|-------------------| +| Worker errors | DevOps troubleshooter | incident-responder | SEV1/SEV2 | +| Performance | DevOps troubleshooter | performance-optimizer | >30min unresolved | +| Database | DevOps troubleshooter | data-validator | Schema issues | +| Security | DevOps troubleshooter | security-analyzer | Breach suspected | +| Application bugs | DevOps troubleshooter | smart-debug | Infrastructure ruled out | + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Full troubleshooting examples +- **Diagnostic Commands**: [diagnostic-commands.md](diagnostic-commands.md) - Command reference +- **Cloudflare Guide**: [cloudflare-workers-guide.md](cloudflare-workers-guide.md) - Platform-specific + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/devops-troubleshooting/templates/INDEX.md b/skills/devops-troubleshooting/templates/INDEX.md new file mode 100644 index 0000000..a4e9fb0 --- /dev/null +++ b/skills/devops-troubleshooting/templates/INDEX.md @@ -0,0 +1,81 @@ +# DevOps Troubleshooter Templates + +Ready-to-use templates for infrastructure incident response, deployment checklists, and performance investigations. 
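+
+Each template is meant to be copied out of the skill directory and filled in; a minimal example is shown below (destination paths are illustrative, not prescribed by the templates), with the individual templates described in the sections that follow:
+
+```bash
+# Copy the incident report template into your project's incident log (illustrative paths)
+cp skills/devops-troubleshooting/templates/incident-report-template.md \
+   docs/incidents/2025-11-09-api-outage.md
+
+# Then replace the [FILL IN] placeholders with incident details
+grep -n "FILL IN" docs/incidents/2025-11-09-api-outage.md
+```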
+ +## Available Templates + +### Incident Report Template + +**File**: [incident-report-template.md](incident-report-template.md) + +Comprehensive template for documenting infrastructure incidents: +- **Incident Overview**: Summary, impact, timeline +- **Root Cause Analysis**: What happened, why it happened +- **Resolution Steps**: What was done to fix it +- **Prevention Measures**: How to prevent recurrence +- **Lessons Learned**: What went well, what could improve + +**Use when**: Documenting production outages, degradations, or significant infrastructure issues + +**Copy and fill in** all sections for your specific incident. + +--- + +### Deployment Checklist + +**File**: [deployment-checklist.md](deployment-checklist.md) + +Pre-deployment and post-deployment verification checklist: +- **Pre-Deployment Verification**: Code review, tests, dependencies, configuration +- **Deployment Steps**: Backup, deploy, verify, rollback plan +- **Post-Deployment Monitoring**: Health checks, metrics, logs, alerts +- **Rollback Procedures**: When and how to rollback + +**Use when**: Deploying Cloudflare Workers, database migrations, infrastructure changes + +**Check off** each item before and after deployment. + +--- + +### Performance Investigation Template + +**File**: [performance-investigation-template.md](performance-investigation-template.md) + +Systematic template for investigating performance issues: +- **Performance Baseline**: Current metrics vs expected +- **Hypothesis Generation**: Potential root causes +- **Data Collection**: Profiling, metrics, logs +- **Analysis**: What the data reveals +- **Optimization Plan**: Prioritized fixes with impact estimates +- **Validation**: Before/after metrics + +**Use when**: API latency increases, database slow queries, high CPU/memory usage + +**Follow systematically** to diagnose and resolve performance problems. + +--- + +## Template Usage + +**How to use these templates**: +1. Copy the template file to your project documentation +2. Fill in all sections marked with `[FILL IN]` placeholders +3. Remove sections that don't apply (optional) +4. Share with your team for review + +**When to create reports**: +- **Incident Report**: After any production incident (SEV1-SEV3) +- **Deployment Checklist**: Before every production deployment +- **Performance Investigation**: When performance degrades >20% + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Real-world troubleshooting walkthroughs +- **Reference**: [Reference Index](../reference/INDEX.md) - Runbooks and diagnostic commands +- **Main Agent**: [devops-troubleshooter.md](../devops-troubleshooter.md) - DevOps troubleshooter agent + +--- + +Return to [main agent](../devops-troubleshooter.md) diff --git a/skills/memory-profiling/SKILL.md b/skills/memory-profiling/SKILL.md new file mode 100644 index 0000000..a528081 --- /dev/null +++ b/skills/memory-profiling/SKILL.md @@ -0,0 +1,85 @@ +--- +name: grey-haven-memory-profiling +description: "Identify memory leaks, inefficient allocations, and optimization opportunities in JavaScript/TypeScript and Python applications. Analyze heap snapshots, allocation patterns, garbage collection, and memory retention. Use when memory grows over time, high memory consumption detected, performance degradation, or when user mentions 'memory leak', 'memory usage', 'heap analysis', 'garbage collection', 'memory profiling', or 'out of memory'." 
+--- + +# Memory Profiling Skill + +Identify memory leaks, inefficiencies, and optimization opportunities in running applications through systematic heap analysis and allocation profiling. + +## Description + +Specialized memory profiling skill for analyzing allocation patterns, heap usage, garbage collection behavior, and memory retention in JavaScript/TypeScript (Node.js, Bun, browsers) and Python applications. Detect memory leaks, optimize memory usage, and prevent out-of-memory errors. + +## What's Included + +### Examples (`examples/`) +- **Memory leak detection** - Finding and fixing common leak patterns +- **Heap snapshot analysis** - Interpreting Chrome DevTools heap snapshots +- **Allocation profiling** - Tracking memory allocation over time +- **Real-world scenarios** - E-commerce app leak, API server memory growth + +### Reference Guides (`reference/`) +- **Profiling tools** - Chrome DevTools, Node.js inspector, Python memory_profiler +- **Memory concepts** - Heap, stack, GC algorithms, retention paths +- **Optimization techniques** - Object pooling, weak references, lazy loading +- **Common leak patterns** - Event listeners, closures, caching, timers + +### Templates (`templates/`) +- **Profiling report template** - Standardized memory analysis reports +- **Heap snapshot comparison template** - Before/after analysis +- **Memory budget template** - Setting and tracking memory limits + +### Checklists (`checklists/`) +- **Memory leak checklist** - Systematic leak detection process +- **Optimization checklist** - Memory optimization verification + +## Use This Skill When + +- ✅ Memory usage growing continuously over time +- ✅ High memory consumption detected (> 500MB for Node, > 1GB for Python) +- ✅ Performance degradation with prolonged runtime +- ✅ Out of memory errors in production +- ✅ Garbage collection causing performance issues +- ✅ Need to optimize memory footprint +- ✅ User mentions: "memory leak", "memory usage", "heap", "garbage collection", "OOM" + +## Related Agents + +- `memory-profiler` - Automated memory analysis and leak detection +- `performance-optimizer` - Broader performance optimization including memory + +## Quick Start + +```bash +# View leak detection examples +cat examples/memory-leak-detection.md + +# Check profiling tools reference +cat reference/profiling-tools.md + +# Use memory leak checklist +cat checklists/memory-leak-checklist.md +``` + +## Common Memory Issues + +1. **Event Listener Leaks** - Unremoved listeners holding references +2. **Closure Leaks** - Variables captured in closures never released +3. **Cache Leaks** - Unbounded caches growing indefinitely +4. **Timer Leaks** - setInterval/setTimeout not cleared +5. **DOM Leaks** - Detached DOM nodes retained in memory +6. **Circular References** - Objects referencing each other preventing GC + +## Typical Workflow + +1. **Detect**: Run profiler, take heap snapshots +2. **Analyze**: Compare snapshots, identify growing objects +3. **Locate**: Find retention paths, trace to source +4. **Fix**: Remove references, clean up resources +5. 
**Verify**: Re-profile to confirm fix + +--- + +**Skill Version**: 1.0 +**Last Updated**: 2025-11-09 diff --git a/skills/memory-profiling/examples/INDEX.md b/skills/memory-profiling/examples/INDEX.md new file mode 100644 index 0000000..d2bc19e --- /dev/null +++ b/skills/memory-profiling/examples/INDEX.md @@ -0,0 +1,86 @@ +# Memory Profiling Examples + +Production memory profiling implementations for Node.js and Python with leak detection, heap analysis, and optimization strategies. + +## Examples Overview + +### Node.js Memory Leak Detection + +**File**: [nodejs-memory-leak.md](nodejs-memory-leak.md) + +Identifying and fixing memory leaks in Node.js applications: +- **Memory leak detection**: Chrome DevTools, heapdump analysis +- **Common leak patterns**: Event listeners, closures, global variables +- **Heap snapshots**: Before/after comparison, retained object analysis +- **Real leak**: EventEmitter leak causing 2GB memory growth +- **Fix**: Proper cleanup with `removeListener()`, WeakMap for caching +- **Result**: Memory stabilized at 150MB (93% reduction) + +**Use when**: Node.js memory growing over time, debugging production memory issues + +--- + +### Python Memory Profiling with Scalene + +**File**: [python-scalene-profiling.md](python-scalene-profiling.md) + +Line-by-line memory profiling for Python applications: +- **Scalene setup**: Installation, pytest integration, CLI usage +- **Memory hotspots**: Line-by-line allocation tracking +- **CPU + Memory**: Combined profiling for performance bottlenecks +- **Real scenario**: 500MB dataset causing OOM, fixed with generators +- **Optimization**: List comprehension → generator (500MB → 5MB) +- **Result**: 99% memory reduction, no OOM errors + +**Use when**: Python memory spikes, profiling pytest tests, finding allocation hotspots + +--- + +### Database Connection Pool Leak + +**File**: [database-connection-leak.md](database-connection-leak.md) + +PostgreSQL connection pool exhaustion and memory leaks: +- **Symptom**: Connection pool maxed out, memory growing linearly +- **Root cause**: Unclosed connections in error paths, missing `finally` blocks +- **Detection**: Connection pool metrics, memory profiling +- **Fix**: Context managers (`with` statement), proper cleanup +- **Result**: Zero connection leaks, memory stable at 80MB + +**Use when**: Database connection errors, "too many clients" errors, connection pool issues + +--- + +### Large Dataset Memory Optimization + +**File**: [large-dataset-optimization.md](large-dataset-optimization.md) + +Memory-efficient data processing for large datasets: +- **Problem**: Loading 10GB CSV into memory (OOM killer) +- **Solutions**: Streaming with `pandas.read_csv(chunksize)`, generators, memory mapping +- **Techniques**: Lazy evaluation, columnar processing, batch processing +- **Before/After**: 10GB memory → 500MB (95% reduction) +- **Tools**: Pandas chunking, Dask for parallel processing + +**Use when**: Processing large files, OOM errors, batch data processing + +--- + +## Quick Navigation + +| Topic | File | Lines | Focus | +|-------|------|-------|-------| +| **Node.js Leaks** | [nodejs-memory-leak.md](nodejs-memory-leak.md) | ~450 | EventEmitter, heap snapshots | +| **Python Scalene** | [python-scalene-profiling.md](python-scalene-profiling.md) | ~420 | Line-by-line profiling | +| **DB Connection Leaks** | [database-connection-leak.md](database-connection-leak.md) | ~380 | Connection pool management | +| **Large Datasets** | [large-dataset-optimization.md](large-dataset-optimization.md) | ~400 
| Streaming, chunking | + +## Related Documentation + +- **Reference**: [Reference Index](../reference/INDEX.md) - Memory patterns, profiling tools +- **Templates**: [Templates Index](../templates/INDEX.md) - Profiling report template +- **Main Agent**: [memory-profiler.md](../memory-profiler.md) - Memory profiler agent + +--- + +Return to [main agent](../memory-profiler.md) diff --git a/skills/memory-profiling/examples/database-connection-leak.md b/skills/memory-profiling/examples/database-connection-leak.md new file mode 100644 index 0000000..ecd9af9 --- /dev/null +++ b/skills/memory-profiling/examples/database-connection-leak.md @@ -0,0 +1,490 @@ +# Database Connection Pool Memory Leaks + +Detecting and fixing PostgreSQL connection pool leaks in FastAPI applications using connection monitoring and proper cleanup patterns. + +## Overview + +**Before Optimization**: +- Active connections: 95/100 (pool exhausted) +- Connection timeouts: 15-20/min during peak +- Memory growth: 100MB/hour (unclosed connections) +- Service restarts: 3-4x/day + +**After Optimization**: +- Active connections: 8-12/100 (healthy pool) +- Connection timeouts: 0/day +- Memory growth: 0MB/hour (stable) +- Service restarts: 0/month + +**Tools**: asyncpg, SQLModel, psycopg3, pg_stat_activity, Prometheus + +## 1. Connection Pool Architecture + +### Grey Haven Stack: PostgreSQL + SQLModel + +**Connection Pool Configuration**: +```python +# database.py +from sqlmodel import create_engine +from sqlalchemy.pool import QueuePool + +# ❌ VULNERABLE: No max_overflow, no timeout +engine = create_engine( + "postgresql://user:pass@localhost/db", + poolclass=QueuePool, + pool_size=20, + echo=True +) + +# ✅ SECURE: Proper pool configuration +engine = create_engine( + "postgresql://user:pass@localhost/db", + poolclass=QueuePool, + pool_size=20, # Core connections + max_overflow=10, # Max additional connections + pool_timeout=30, # Wait timeout (seconds) + pool_recycle=3600, # Recycle after 1 hour + pool_pre_ping=True, # Verify connection before use + echo=False +) +``` + +**Pool Health Monitoring**: +```python +# monitoring.py +from prometheus_client import Gauge + +# Prometheus metrics +db_pool_size = Gauge('db_pool_connections_total', 'Total pool size') +db_pool_active = Gauge('db_pool_connections_active', 'Active connections') +db_pool_idle = Gauge('db_pool_connections_idle', 'Idle connections') +db_pool_overflow = Gauge('db_pool_connections_overflow', 'Overflow connections') + +def record_pool_metrics(engine): + pool = engine.pool + db_pool_size.set(pool.size()) + db_pool_active.set(pool.checkedout()) + db_pool_idle.set(pool.size() - pool.checkedout()) + db_pool_overflow.set(pool.overflow()) +``` + +## 2. Common Leak Pattern: Unclosed Connections + +### Vulnerable Code (Connection Leak) + +```python +# api/orders.py (BEFORE) +from fastapi import APIRouter, Depends +from sqlmodel import Session, select +from database import engine + +router = APIRouter() + +@router.get("/orders") +async def get_orders(): + # ❌ LEAK: Connection never closed + session = Session(engine) + + # If exception occurs here, session never closed + orders = session.exec(select(Order)).all() + + # If return happens here, session never closed + return orders + + # session.close() never reached if early return/exception + session.close() +``` + +**What Happens**: +1. Every request acquires connection from pool +2. Exception/early return prevents `session.close()` +3. Connection remains in "active" state +4. Pool exhausts after 100 requests (pool_size=100) +5. 
New requests timeout waiting for connection + +**Memory Impact**: +``` +Initial pool: 20 connections (40MB) +After 1 hour: 95 leaked connections (190MB) +After 6 hours: Pool exhausted + 100MB leaked memory +``` + +### Fixed Code (Context Manager) + +```python +# api/orders.py (AFTER) +from fastapi import APIRouter, Depends +from sqlmodel import Session, select +from database import engine, get_session +from contextlib import contextmanager + +router = APIRouter() + +# ✅ Option 1: FastAPI dependency injection (recommended) +def get_session(): + """Session dependency with automatic cleanup""" + with Session(engine) as session: + yield session + +@router.get("/orders") +async def get_orders(session: Session = Depends(get_session)): + # Session automatically closed after request + orders = session.exec(select(Order)).all() + return orders + + +# ✅ Option 2: Explicit context manager +@router.get("/orders-alt") +async def get_orders_alt(): + with Session(engine) as session: + orders = session.exec(select(Order)).all() + return orders + # Session guaranteed to close (even on exception) +``` + +**Why This Works**: +- Context manager ensures `session.close()` called in `__exit__` +- Works even if exception raised +- Works even if early return +- FastAPI `Depends()` handles async cleanup + +## 3. Async Connection Leaks (asyncpg) + +### Vulnerable Async Pattern + +```python +# api/analytics.py (BEFORE) +import asyncpg +from fastapi import APIRouter + +router = APIRouter() + +@router.get("/analytics") +async def get_analytics(): + # ❌ LEAK: Connection never closed + conn = await asyncpg.connect( + user='postgres', + password='secret', + database='analytics' + ) + + # Exception here = connection leaked + result = await conn.fetch('SELECT * FROM metrics WHERE date > $1', date) + + # Early return = connection leaked + if not result: + return [] + + await conn.close() # Never reached + return result +``` + +### Fixed Async Pattern + +```python +# api/analytics.py (AFTER) +import asyncpg +from fastapi import APIRouter +from contextlib import asynccontextmanager + +router = APIRouter() + +# ✅ Connection pool (shared across requests) +pool: asyncpg.Pool = None + +@asynccontextmanager +async def get_db_connection(): + """Async context manager for connections""" + conn = await pool.acquire() + try: + yield conn + finally: + await pool.release(conn) + +@router.get("/analytics") +async def get_analytics(): + async with get_db_connection() as conn: + result = await conn.fetch( + 'SELECT * FROM metrics WHERE date > $1', + date + ) + return result + # Connection automatically released to pool +``` + +**Pool Setup** (application startup): +```python +# main.py +from fastapi import FastAPI +import asyncpg + +app = FastAPI() + +@app.on_event("startup") +async def startup(): + global pool + pool = await asyncpg.create_pool( + user='postgres', + password='secret', + database='analytics', + min_size=10, # Minimum connections + max_size=20, # Maximum connections + max_inactive_connection_lifetime=300 # Recycle after 5 min + ) + +@app.on_event("shutdown") +async def shutdown(): + await pool.close() +``` + +## 4. 
Transaction Leak Detection + +### Monitoring Active Connections + +**PostgreSQL Query**: +```sql +-- Show active connections with details +SELECT + pid, + usename, + application_name, + client_addr, + state, + query, + state_change, + NOW() - state_change AS duration +FROM pg_stat_activity +WHERE state != 'idle' +ORDER BY duration DESC; +``` + +**Prometheus Metrics**: +```python +# monitoring.py +from prometheus_client import Gauge +import asyncpg + +db_connections_active = Gauge( + 'db_connections_active', + 'Active database connections', + ['state'] +) + +async def monitor_connections(pool: asyncpg.Pool): + """Monitor PostgreSQL connections every 30 seconds""" + async with pool.acquire() as conn: + rows = await conn.fetch(""" + SELECT state, COUNT(*) as count + FROM pg_stat_activity + WHERE datname = current_database() + GROUP BY state + """) + + for row in rows: + db_connections_active.labels(state=row['state']).set(row['count']) +``` + +**Grafana Alert** (connection leak): +```yaml +alert: DatabaseConnectionLeak +expr: db_connections_active{state="active"} > 80 +for: 5m +annotations: + summary: "Potential connection leak ({{ $value }} active connections)" + description: "Active connections have been above 80 for 5+ minutes" +``` + +## 5. Real-World Fix: FastAPI Order Service + +### Before (Connection Pool Exhaustion) + +```python +# services/order_processor.py (BEFORE) +from sqlmodel import Session, select +from database import engine +from models import Order, OrderItem + +class OrderProcessor: + async def process_order(self, order_id: int): + # ❌ LEAK: Multiple sessions, some never closed + session1 = Session(engine) + order = session1.get(Order, order_id) + + if not order: + # Early return = session1 leaked + return None + + # ❌ LEAK: Second session + session2 = Session(engine) + items = session2.exec( + select(OrderItem).where(OrderItem.order_id == order_id) + ).all() + + # Exception here = both sessions leaked + total = sum(item.price * item.quantity for item in items) + + order.total = total + session1.commit() + + # Only session1 closed, session2 leaked + session1.close() + return order +``` + +**Metrics (Before)**: +``` +Connection pool: 100 connections +Active connections after 1 hour: 95/100 +Leaked connections: ~12/min +Memory growth: 100MB/hour +Pool exhaustion: Every 6-8 hours +``` + +### After (Proper Resource Management) + +```python +# services/order_processor.py (AFTER) +from sqlmodel import Session, select +from database import engine, get_session +from models import Order, OrderItem +from contextlib import contextmanager + +class OrderProcessor: + async def process_order(self, order_id: int): + # ✅ Single session, guaranteed cleanup + with Session(engine) as session: + # Query order + order = session.get(Order, order_id) + if not order: + return None + + # Query items (same session) + items = session.exec( + select(OrderItem).where(OrderItem.order_id == order_id) + ).all() + + # Calculate total + total = sum(item.price * item.quantity for item in items) + + # Update order + order.total = total + session.add(order) + session.commit() + session.refresh(order) + + return order + # Session automatically closed (even on exception) +``` + +**Metrics (After)**: +``` +Connection pool: 100 connections +Active connections: 8-12/100 (stable) +Leaked connections: 0/day +Memory growth: 0MB/hour +Pool exhaustion: Never (0 incidents/month) +``` + +## 6. 
Connection Pool Configuration Best Practices + +### Recommended Settings (Grey Haven Stack) + +```python +# database.py - Production settings +from sqlmodel import create_engine +from sqlalchemy.pool import QueuePool + +engine = create_engine( + database_url, + poolclass=QueuePool, + pool_size=20, # (workers * connections/worker) + buffer + max_overflow=10, # 50% of pool_size + pool_timeout=30, # Wait timeout + pool_recycle=3600, # Recycle after 1h + pool_pre_ping=True # Health check +) +``` + +**Pool Size Formula**: `pool_size = (workers * conn_per_worker) + buffer` +Example: `(4 workers * 3 conn) + 8 buffer = 20` + +## 7. Testing Connection Cleanup + +### Pytest Fixture for Connection Tracking + +```python +# tests/conftest.py +import pytest +from sqlmodel import Session, create_engine + +@pytest.fixture +def engine(): + """Test engine with connection tracking""" + test_engine = create_engine("postgresql://test:test@localhost/test_db", pool_size=5) + initial_active = test_engine.pool.checkedout() + yield test_engine + final_active = test_engine.pool.checkedout() + assert final_active == initial_active, f"Leaked {final_active - initial_active} connections" + +@pytest.mark.asyncio +async def test_no_connection_leak_under_load(engine): + """Simulate 1000 concurrent requests""" + initial = engine.pool.checkedout() + tasks = [get_orders() for _ in range(1000)] + await asyncio.gather(*tasks) + await asyncio.sleep(1) + assert engine.pool.checkedout() == initial, "Connection leak detected" +``` + +## 8. CI/CD Integration + +```yaml +# .github/workflows/connection-leak-test.yml +name: Connection Leak Detection +on: [pull_request] +jobs: + leak-test: + runs-on: ubuntu-latest + services: + postgres: + image: postgres:15 + env: {POSTGRES_PASSWORD: test, POSTGRES_DB: test_db} + ports: [5432:5432] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: {python-version: '3.11'} + - run: pip install -r requirements.txt pytest pytest-asyncio + - run: pytest tests/test_connection_leaks.py -v +``` + +## 9. Results and Impact + +### Before vs After Metrics + +| Metric | Before | After | Impact | +|--------|--------|-------|--------| +| **Active Connections** | 95/100 (95%) | 8-12/100 (10%) | **85% reduction** | +| **Connection Timeouts** | 15-20/min | 0/day | **100% eliminated** | +| **Memory Growth** | 100MB/hour | 0MB/hour | **100% eliminated** | +| **Service Restarts** | 3-4x/day | 0/month | **100% eliminated** | +| **Pool Wait Time (p95)** | 5.2s | 0.01s | **99.8% faster** | + +### Key Optimizations Applied + +1. **Context Managers**: Guaranteed connection cleanup (even on exceptions) +2. **FastAPI Dependencies**: Automatic session lifecycle management +3. **Connection Pooling**: Proper pool_size, max_overflow, pool_timeout +4. **Prometheus Monitoring**: Real-time pool saturation metrics +5. 
**Load Testing**: CI/CD checks for connection leaks + +## Related Documentation + +- **Node.js Leaks**: [nodejs-memory-leak.md](nodejs-memory-leak.md) +- **Python Profiling**: [python-scalene-profiling.md](python-scalene-profiling.md) +- **Large Datasets**: [large-dataset-optimization.md](large-dataset-optimization.md) +- **Reference**: [../reference/profiling-tools.md](../reference/profiling-tools.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/memory-profiling/examples/large-dataset-optimization.md b/skills/memory-profiling/examples/large-dataset-optimization.md new file mode 100644 index 0000000..ddb0c50 --- /dev/null +++ b/skills/memory-profiling/examples/large-dataset-optimization.md @@ -0,0 +1,452 @@ +# Large Dataset Memory Optimization + +Memory-efficient patterns for processing multi-GB datasets in Python and Node.js without OOM errors. + +## Overview + +**Before Optimization**: +- Dataset size: 10GB CSV (50M rows) +- Memory usage: 20GB (2x dataset size) +- Processing time: 45 minutes +- OOM errors: Frequent (3-4x/day) + +**After Optimization**: +- Dataset size: Same (10GB, 50M rows) +- Memory usage: 500MB (constant) +- Processing time: 12 minutes (73% faster) +- OOM errors: 0/month + +**Tools**: Polars, pandas chunking, generators, streaming parsers + +## 1. Problem: Loading Entire Dataset + +### Vulnerable Pattern (Pandas read_csv) + +```python +# analysis.py (BEFORE) +import pandas as pd + +def analyze_sales_data(filename: str): + # ❌ Loads entire 10GB file into memory + df = pd.read_csv(filename) # 20GB RAM usage + + # ❌ Creates copies for each operation + df['total'] = df['quantity'] * df['price'] # +10GB + df_filtered = df[df['total'] > 1000] # +8GB + df_sorted = df_filtered.sort_values('total', ascending=False) # +8GB + + # Peak memory: 46GB for 10GB file! + return df_sorted.head(100) +``` + +**Memory Profile**: +``` +Step 1 (read_csv): 20GB +Step 2 (calculation): +10GB = 30GB +Step 3 (filter): +8GB = 38GB +Step 4 (sort): +8GB = 46GB +Result: OOM on 32GB machine +``` + +## 2. Solution 1: Pandas Chunking + +### Chunk-Based Processing + +```python +# analysis.py (AFTER - Chunking) +import pandas as pd +from typing import Iterator + +def analyze_sales_data_chunked(filename: str, chunk_size: int = 100000): + """Process 100K rows at a time (constant memory)""" + + top_sales = [] + + # ✅ Process in chunks (100K rows = ~50MB each) + for chunk in pd.read_csv(filename, chunksize=chunk_size): + # Calculate total (in-place when possible) + chunk['total'] = chunk['quantity'] * chunk['price'] + + # Filter high-value sales + filtered = chunk[chunk['total'] > 1000] + + # Keep top 100 from this chunk + top_chunk = filtered.nlargest(100, 'total') + top_sales.append(top_chunk) + + # chunk goes out of scope, memory freed + + # Combine top results from all chunks + final_df = pd.concat(top_sales).nlargest(100, 'total') + return final_df +``` + +**Memory Profile (Chunked)**: +``` +Chunk 1: 50MB (process) → 10MB (top 100) → garbage collected +Chunk 2: 50MB (process) → 10MB (top 100) → garbage collected +... +Chunk 500: 50MB (process) → 10MB (top 100) → garbage collected +Final combine: 500 * 10MB = 500MB total +Peak memory: 500MB (99% reduction!) +``` + +## 3. 
Solution 2: Polars (Lazy Evaluation) + +### Polars for Large Datasets + +**Why Polars**: +- 10-100x faster than pandas +- True streaming (doesn't load entire file) +- Query optimizer (like SQL databases) +- Parallel processing (uses all CPU cores) + +```python +# analysis.py (POLARS) +import polars as pl + +def analyze_sales_data_polars(filename: str): + """Polars lazy evaluation - constant memory""" + + result = ( + pl.scan_csv(filename) # ✅ Lazy: doesn't load yet + .with_columns([ + (pl.col('quantity') * pl.col('price')).alias('total') + ]) + .filter(pl.col('total') > 1000) + .sort('total', descending=True) + .head(100) + .collect(streaming=True) # ✅ Streaming: processes in chunks + ) + + return result +``` + +**Memory Profile (Polars Streaming)**: +``` +Memory usage: 200-300MB (constant) +Processing: Parallel chunks, optimized query plan +Time: 12 minutes vs 45 minutes (pandas) +``` + +## 4. Node.js Streaming + +### CSV Streaming with csv-parser + +```typescript +// analysis.ts (BEFORE) +import fs from 'fs'; +import Papa from 'papaparse'; + +async function analyzeSalesData(filename: string) { + // ❌ Loads entire 10GB file + const fileContent = fs.readFileSync(filename, 'utf-8'); // 20GB RAM + const parsed = Papa.parse(fileContent, { header: true }); // +10GB + + // Process all rows + const results = parsed.data.map(row => ({ + total: row.quantity * row.price + })); + + return results; // 30GB total +} +``` + +**Fixed with Streaming**: +```typescript +// analysis.ts (AFTER - Streaming) +import fs from 'fs'; +import csv from 'csv-parser'; +import { pipeline } from 'stream/promises'; + +async function analyzeSalesDataStreaming(filename: string) { + const topSales: Array<{row: any, total: number}> = []; + + await pipeline( + fs.createReadStream(filename), // ✅ Stream (not load all) + csv(), + async function* (source) { + for await (const row of source) { + const total = row.quantity * row.price; + + if (total > 1000) { + topSales.push({ row, total }); + + // Keep only top 100 (memory bounded) + if (topSales.length > 100) { + topSales.sort((a, b) => b.total - a.total); + topSales.length = 100; + } + } + } + yield topSales; + } + ); + + return topSales; +} +``` + +**Memory Profile (Streaming)**: +``` +Buffer: 64KB (stream chunk size) +Processing: One row at a time +Array: 100 rows max (bounded) +Peak memory: 5MB vs 30GB (99.98% reduction!) +``` + +## 5. 
Generator Pattern (Python) + +### Memory-Efficient Pipeline + +```python +# pipeline.py (Generator-based) +from typing import Iterator +import csv + +def read_csv_streaming(filename: str) -> Iterator[dict]: + """Read CSV line by line (not all at once)""" + with open(filename, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + yield row # ✅ One row at a time + +def calculate_totals(rows: Iterator[dict]) -> Iterator[dict]: + """Calculate totals (lazy)""" + for row in rows: + row['total'] = float(row['quantity']) * float(row['price']) + yield row + +def filter_high_value(rows: Iterator[dict], threshold: float = 1000) -> Iterator[dict]: + """Filter high-value sales (lazy)""" + for row in rows: + if row['total'] > threshold: + yield row + +def top_n(rows: Iterator[dict], n: int = 100) -> list[dict]: + """Keep top N rows (bounded memory)""" + import heapq + return heapq.nlargest(n, rows, key=lambda x: x['total']) + +# ✅ Pipeline: each stage processes one row at a time +def analyze_sales_pipeline(filename: str): + rows = read_csv_streaming(filename) + with_totals = calculate_totals(rows) + high_value = filter_high_value(with_totals) + top_100 = top_n(high_value, 100) + return top_100 +``` + +**Memory Profile (Generator Pipeline)**: +``` +Stage 1 (read): 1 row (few KB) +Stage 2 (calculate): 1 row (few KB) +Stage 3 (filter): 1 row (few KB) +Stage 4 (top_n): 100 rows (bounded) +Peak memory: <1MB (constant) +``` + +## 6. Real-World: E-Commerce Analytics + +### Before (Pandas load_all) + +```python +# analytics_service.py (BEFORE) +import pandas as pd + +class AnalyticsService: + def generate_sales_report(self, start_date: str, end_date: str): + # ❌ Load entire orders table (10GB) + orders = pd.read_sql( + "SELECT * FROM orders WHERE date BETWEEN %s AND %s", + engine, + params=(start_date, end_date) + ) # 20GB RAM + + # ❌ Load entire order_items (50GB) + items = pd.read_sql("SELECT * FROM order_items", engine) # +100GB RAM + + # Join (creates another copy) + merged = orders.merge(items, on='order_id') # +150GB + + # Aggregate + summary = merged.groupby('category').agg({ + 'total': 'sum', + 'quantity': 'sum' + }) + + return summary # Peak: 270GB - OOM! +``` + +### After (Database Aggregation + Chunking) + +```python +# analytics_service.py (AFTER) +import pandas as pd + +class AnalyticsService: + def generate_sales_report(self, start_date: str, end_date: str): + # ✅ Aggregate in database (PostgreSQL does the work) + query = """ + SELECT + oi.category, + SUM(oi.price * oi.quantity) as total, + SUM(oi.quantity) as quantity + FROM orders o + JOIN order_items oi ON o.id = oi.order_id + WHERE o.date BETWEEN %(start)s AND %(end)s + GROUP BY oi.category + """ + + # Result: aggregated data (few KB, not 270GB!) + summary = pd.read_sql( + query, + engine, + params={'start': start_date, 'end': end_date} + ) + + return summary # Peak: 1MB vs 270GB +``` + +**Metrics**: +``` +Before: 270GB RAM, OOM error +After: 1MB RAM, 99.9996% reduction +Time: 45 min → 30 seconds (90x faster) +``` + +## 7. 
Dask for Parallel Processing + +### Dask DataFrame (Parallel Chunking) + +```python +# analysis_dask.py +import dask.dataframe as dd + +def analyze_sales_data_dask(filename: str): + """Process in parallel chunks across CPU cores""" + + # ✅ Lazy loading, parallel processing + df = dd.read_csv( + filename, + blocksize='64MB' # Process 64MB chunks + ) + + # All operations are lazy (no computation yet) + df['total'] = df['quantity'] * df['price'] + filtered = df[df['total'] > 1000] + top_100 = filtered.nlargest(100, 'total') + + # ✅ Trigger computation (parallel across cores) + result = top_100.compute() + + return result +``` + +**Memory Profile (Dask)**: +``` +Workers: 8 (one per CPU core) +Memory per worker: 100MB +Total memory: 800MB vs 46GB +Speed: 4-8x faster (parallel) +``` + +## 8. Memory Monitoring + +### Track Memory Usage During Processing + +```python +# monitor.py +import tracemalloc +import psutil +from contextlib import contextmanager + +@contextmanager +def memory_monitor(label: str): + """Monitor memory usage of code block""" + + # Start tracking + tracemalloc.start() + process = psutil.Process() + mem_before = process.memory_info().rss / 1024 / 1024 # MB + + yield + + # Measure after + mem_after = process.memory_info().rss / 1024 / 1024 + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + print(f"{label}:") + print(f" Memory before: {mem_before:.1f} MB") + print(f" Memory after: {mem_after:.1f} MB") + print(f" Memory delta: {mem_after - mem_before:.1f} MB") + print(f" Peak traced: {peak / 1024 / 1024:.1f} MB") + +# Usage +with memory_monitor("Pandas load_all"): + df = pd.read_csv("large_file.csv") # Shows high memory usage + +with memory_monitor("Polars streaming"): + df = pl.scan_csv("large_file.csv").collect(streaming=True) # Low memory +``` + +## 9. Optimization Decision Tree + +**Choose the right tool based on dataset size**: + +``` +Dataset < 1GB: + → Use pandas.read_csv() (simple, fast) + +Dataset 1-10GB: + → Use pandas chunking (chunksize=100000) + → Or Polars streaming (faster, less memory) + +Dataset 10-100GB: + → Use Polars streaming (best performance) + → Or Dask (parallel processing) + → Or Database aggregation (PostgreSQL, ClickHouse) + +Dataset > 100GB: + → Database aggregation (required) + → Or Spark/Ray (distributed computing) + → Never load into memory +``` + +## 10. Results and Impact + +### Before vs After Metrics + +| Metric | Before (pandas) | After (Polars) | Impact | +|--------|----------------|----------------|--------| +| **Memory Usage** | 46GB | 300MB | **99.3% reduction** | +| **Processing Time** | 45 min | 12 min | **73% faster** | +| **OOM Errors** | 3-4/day | 0/month | **100% eliminated** | +| **Max Dataset Size** | 10GB | 500GB+ | **50x scalability** | + +### Key Optimizations Applied + +1. **Chunking**: Process 100K rows at a time (constant memory) +2. **Lazy Evaluation**: Polars/Dask don't load until needed +3. **Streaming**: One row at a time (generators, Node.js streams) +4. **Database Aggregation**: Let PostgreSQL do the work +5. 
**Bounded Memory**: heapq.nlargest() keeps top N (not all rows) + +### Cost Savings + +**Infrastructure costs**: +- Before: r5.8xlarge (256GB RAM) = $1.344/hour +- After: r5.large (16GB RAM) = $0.084/hour +- **Savings**: 94% reduction ($23,000/year per service) + +## Related Documentation + +- **Node.js Leaks**: [nodejs-memory-leak.md](nodejs-memory-leak.md) +- **Python Profiling**: [python-scalene-profiling.md](python-scalene-profiling.md) +- **DB Leaks**: [database-connection-leak.md](database-connection-leak.md) +- **Reference**: [../reference/memory-optimization-patterns.md](../reference/memory-optimization-patterns.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/memory-profiling/examples/nodejs-memory-leak.md b/skills/memory-profiling/examples/nodejs-memory-leak.md new file mode 100644 index 0000000..70001b3 --- /dev/null +++ b/skills/memory-profiling/examples/nodejs-memory-leak.md @@ -0,0 +1,490 @@ +# Node.js Memory Leak Detection + +Identifying and fixing memory leaks in Node.js applications using Chrome DevTools, heapdump, and memory profiling techniques. + +## Overview + +**Symptoms Before Fix**: +- Memory usage: 150MB → 2GB over 6 hours +- Heap size growing linearly (5MB/minute) +- V8 garbage collection ineffective +- Production outages (OOM killer) + +**After Fix**: +- Memory stable at 150MB (93% reduction) +- Heap size constant over time +- Zero OOM errors in 30 days +- Proper resource cleanup + +**Tools**: Chrome DevTools, heapdump, memwatch-next, Prometheus monitoring + +## 1. Memory Leak Symptoms + +### Linear Memory Growth + +```bash +# Monitor Node.js memory usage +node --expose-gc --inspect app.js + +# Connect Chrome DevTools: chrome://inspect +# Memory tab → Take heap snapshot every 5 minutes +``` + +**Heap growth pattern**: +``` +Time | Heap Size | External | Total +------|-----------|----------|------- +0 min | 50MB | 10MB | 60MB +5 min | 75MB | 15MB | 90MB +10min | 100MB | 20MB | 120MB +15min | 125MB | 25MB | 150MB +... | ... | ... | ... +6 hrs | 1.8GB | 200MB | 2GB +``` + +**Diagnosis**: Linear growth indicates memory leak (not normal sawtooth GC pattern) + +### High GC Activity + +```javascript +// Monitor GC events +const v8 = require('v8'); +const memoryUsage = process.memoryUsage(); + +setInterval(() => { + const usage = process.memoryUsage(); + console.log({ + heapUsed: `${Math.round(usage.heapUsed / 1024 / 1024)}MB`, + heapTotal: `${Math.round(usage.heapTotal / 1024 / 1024)}MB`, + external: `${Math.round(usage.external / 1024 / 1024)}MB`, + rss: `${Math.round(usage.rss / 1024 / 1024)}MB` + }); +}, 60000); // Every minute +``` + +**Output showing leak**: +``` +{heapUsed: '75MB', heapTotal: '100MB', external: '15MB', rss: '120MB'} +{heapUsed: '100MB', heapTotal: '130MB', external: '20MB', rss: '150MB'} +{heapUsed: '125MB', heapTotal: '160MB', external: '25MB', rss: '185MB'} +``` + +## 2. Heap Snapshot Analysis + +### Taking Heap Snapshots + +```javascript +// Generate heap snapshot programmatically +const v8 = require('v8'); +const fs = require('fs'); + +function takeHeapSnapshot(filename) { + const heapSnapshot = v8.writeHeapSnapshot(filename); + console.log(`Heap snapshot written to ${heapSnapshot}`); +} + +// Take snapshot every hour +setInterval(() => { + const timestamp = new Date().toISOString().replace(/:/g, '-'); + takeHeapSnapshot(`heap-${timestamp}.heapsnapshot`); +}, 3600000); +``` + +### Analyzing Snapshots in Chrome DevTools + +**Steps**: +1. Load two snapshots (before and after 1 hour) +2. 
Compare snapshots (Comparison view) +3. Sort by "Size Delta" (descending) +4. Look for objects growing significantly + +**Example Analysis**: +``` +Object Type | Count | Size Delta | Retained Size +----------------------|--------|------------|--------------- +(array) | +5,000 | +50MB | +60MB +EventEmitter | +1,200 | +12MB | +15MB +Closure (anonymous) | +800 | +8MB | +10MB +``` + +**Diagnosis**: EventEmitter count growing = likely event listener leak + +### Retained Objects Analysis + +```javascript +// Chrome DevTools → Heap Snapshot → Summary → sort by "Retained Size" +// Click object → view Retainer tree +``` + +**Retainer tree example** (EventEmitter leak): +``` +EventEmitter @123456 + ← listeners: Array[50] + ← _events.data: Array + ← EventEmitter @123456 (self-reference leak!) +``` + +## 3. Common Memory Leak Patterns + +### Pattern 1: Event Listener Leak + +**Vulnerable Code**: +```typescript +// ❌ LEAK: EventEmitter listeners never removed +import {EventEmitter} from 'events'; + +class DataProcessor { + private emitter = new EventEmitter(); + + async processOrders() { + // Add listener every time function called + this.emitter.on('data', (data) => { + console.log('Processing:', data); + }); + + // Emit 1000 events + for (let i = 0; i < 1000; i++) { + this.emitter.emit('data', {id: i}); + } + } +} + +// Called 1000 times = 1000 listeners accumulate! +setInterval(() => new DataProcessor().processOrders(), 1000); +``` + +**Result**: 1000 listeners/second = 3.6M listeners/hour → 2GB memory leak + +**Fixed Code**: +```typescript +// ✅ FIXED: Remove listener after use +class DataProcessor { + private emitter = new EventEmitter(); + + async processOrders() { + const handler = (data) => { + console.log('Processing:', data); + }; + + this.emitter.on('data', handler); + + try { + for (let i = 0; i < 1000; i++) { + this.emitter.emit('data', {id: i}); + } + } finally { + // ✅ Clean up listener + this.emitter.removeListener('data', handler); + } + } +} +``` + +**Better**: Use `once()` for one-time listeners: +```typescript +this.emitter.once('data', handler); // Auto-removed after first emit +``` + +### Pattern 2: Closure Leak + +**Vulnerable Code**: +```typescript +// ❌ LEAK: Closure captures large object +const cache = new Map(); + +function processRequest(userId: string) { + const largeData = fetchLargeDataset(userId); // 10MB object + + // Closure captures entire largeData + cache.set(userId, () => { + return largeData.summary; // Only need summary (1KB) + }); +} + +// Called for 1000 users = 10GB in cache! +``` + +**Fixed Code**: +```typescript +// ✅ FIXED: Only store what you need +const cache = new Map(); + +function processRequest(userId: string) { + const largeData = fetchLargeDataset(userId); + const summary = largeData.summary; // Extract only 1KB + + // Store minimal data + cache.set(userId, () => summary); +} + +// 1000 users = 1MB in cache ✅ +``` + +### Pattern 3: Global Variable Accumulation + +**Vulnerable Code**: +```typescript +// ❌ LEAK: Global array keeps growing +const requestLog: Request[] = []; + +app.post('/api/orders', (req, res) => { + requestLog.push(req); // Never removed! + // ... 
process order +}); + +// 1M requests = 1M objects in memory permanently +``` + +**Fixed Code**: +```typescript +// ✅ FIXED: Use LRU cache with size limit +import LRU from 'lru-cache'; + +const requestLog = new LRU({ + max: 1000, // Maximum 1000 items + ttl: 1000 * 60 * 5 // 5-minute TTL +}); + +app.post('/api/orders', (req, res) => { + requestLog.set(req.id, req); // Auto-evicts old items +}); +``` + +### Pattern 4: Forgotten Timers/Intervals + +**Vulnerable Code**: +```typescript +// ❌ LEAK: setInterval never cleared +class ReportGenerator { + private data: any[] = []; + + start() { + setInterval(() => { + this.data.push(generateReport()); // Accumulates forever + }, 60000); + } +} + +// Each instance leaks! +const generator = new ReportGenerator(); +generator.start(); +``` + +**Fixed Code**: +```typescript +// ✅ FIXED: Clear interval on cleanup +class ReportGenerator { + private data: any[] = []; + private intervalId?: NodeJS.Timeout; + + start() { + this.intervalId = setInterval(() => { + this.data.push(generateReport()); + }, 60000); + } + + stop() { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = undefined; + this.data = []; // Clear accumulated data + } + } +} +``` + +## 4. Memory Profiling with memwatch-next + +### Installation + +```bash +bun add memwatch-next +``` + +### Leak Detection + +```typescript +// memory-monitor.ts +import memwatch from 'memwatch-next'; + +// Detect memory leaks +memwatch.on('leak', (info) => { + console.error('Memory leak detected:', { + growth: info.growth, + reason: info.reason, + current_base: `${Math.round(info.current_base / 1024 / 1024)}MB`, + leaked: `${Math.round((info.current_base - info.start) / 1024 / 1024)}MB` + }); + + // Alert to PagerDuty/Slack + alertOps('Memory leak detected', info); +}); + +// Monitor GC stats +memwatch.on('stats', (stats) => { + console.log('GC stats:', { + used_heap_size: `${Math.round(stats.used_heap_size / 1024 / 1024)}MB`, + heap_size_limit: `${Math.round(stats.heap_size_limit / 1024 / 1024)}MB`, + num_full_gc: stats.num_full_gc, + num_inc_gc: stats.num_inc_gc + }); +}); +``` + +### HeapDiff for Leak Analysis + +```typescript +import memwatch from 'memwatch-next'; + +const hd = new memwatch.HeapDiff(); + +// Simulate leak +const leak: any[] = []; +for (let i = 0; i < 10000; i++) { + leak.push({data: new Array(1000).fill('x')}); +} + +// Compare heaps +const diff = hd.end(); +console.log('Heap diff:', JSON.stringify(diff, null, 2)); + +// Output: +// { +// "before": {"nodes": 12345, "size": 50000000}, +// "after": {"nodes": 22345, "size": 150000000}, +// "change": { +// "size_bytes": 100000000, // 100MB leak! +// "size": "100.00MB", +// "freed_nodes": 100, +// "allocated_nodes": 10100 // Net increase +// } +// } +``` + +## 5. Production Memory Monitoring + +### Prometheus Metrics + +```typescript +// metrics.ts +import {Gauge} from 'prom-client'; + +const memoryUsageGauge = new Gauge({ + name: 'nodejs_memory_usage_bytes', + help: 'Node.js memory usage in bytes', + labelNames: ['type'] +}); + +setInterval(() => { + const usage = process.memoryUsage(); + memoryUsageGauge.set({type: 'heap_used'}, usage.heapUsed); + memoryUsageGauge.set({type: 'heap_total'}, usage.heapTotal); + memoryUsageGauge.set({type: 'external'}, usage.external); + memoryUsageGauge.set({type: 'rss'}, usage.rss); +}, 15000); +``` + +**Grafana Alert**: +```promql +# Alert if heap usage growing linearly +increase(nodejs_memory_usage_bytes{type="heap_used"}[1h]) > 100000000 # 100MB/hour +``` + +## 6. 
Real-World Fix: EventEmitter Leak + +### Before (Leaking) + +```typescript +// order-processor.ts (BEFORE FIX) +class OrderProcessor { + private emitter = new EventEmitter(); + + async processOrders() { + // ❌ LEAK: Listener added every call + this.emitter.on('order:created', async (order) => { + await this.sendConfirmationEmail(order); + await this.updateInventory(order); + }); + + const orders = await db.query.orders.findMany({status: 'pending'}); + for (const order of orders) { + this.emitter.emit('order:created', order); + } + } +} + +// Called every minute +setInterval(() => new OrderProcessor().processOrders(), 60000); +``` + +**Result**: 1,440 listeners/day → 2GB memory leak in production + +### After (Fixed) + +```typescript +// order-processor.ts (AFTER FIX) +class OrderProcessor { + private emitter = new EventEmitter(); + private listeners = new WeakMap(); // Track listeners for cleanup + + async processOrders() { + const handler = async (order) => { + await this.sendConfirmationEmail(order); + await this.updateInventory(order); + }; + + // ✅ Use once() for one-time processing + this.emitter.once('order:created', handler); + + const orders = await db.query.orders.findMany({status: 'pending'}); + for (const order of orders) { + this.emitter.emit('order:created', order); + } + + // ✅ Cleanup (if using on() instead of once()) + this.emitter.removeAllListeners('order:created'); + } +} +``` + +**Result**: Memory stable at 150MB, zero leaks + +## 7. Results and Impact + +### Before vs After Metrics + +| Metric | Before Fix | After Fix | Impact | +|--------|-----------|-----------|---------| +| **Memory Usage** | 2GB (after 6h) | 150MB (stable) | **93% reduction** | +| **Heap Size** | Linear growth (5MB/min) | Stable | **Zero growth** | +| **OOM Incidents** | 12/month | 0/month | **100% eliminated** | +| **GC Pause Time** | 200ms avg | 50ms avg | **75% faster** | +| **Uptime** | 6 hours avg | 30+ days | **120x improvement** | + +### Lessons Learned + +**1. Always remove event listeners** +- Use `once()` for one-time events +- Use `removeListener()` in finally blocks +- Track listeners with WeakMap for debugging + +**2. Avoid closures capturing large objects** +- Extract only needed data before closure +- Use WeakMap/WeakSet for object references +- Profile with heap snapshots regularly + +**3. Monitor memory in production** +- Prometheus metrics for heap usage +- Alert on linear growth patterns +- Weekly heap snapshot analysis + +## Related Documentation + +- **Python Profiling**: [python-scalene-profiling.md](python-scalene-profiling.md) +- **DB Leaks**: [database-connection-leak.md](database-connection-leak.md) +- **Reference**: [../reference/memory-patterns.md](../reference/memory-patterns.md) +- **Templates**: [../templates/memory-report.md](../templates/memory-report.md) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/memory-profiling/examples/python-scalene-profiling.md b/skills/memory-profiling/examples/python-scalene-profiling.md new file mode 100644 index 0000000..536efbf --- /dev/null +++ b/skills/memory-profiling/examples/python-scalene-profiling.md @@ -0,0 +1,456 @@ +# Python Memory Profiling with Scalene + +Line-by-line memory and CPU profiling for Python applications using Scalene, with pytest integration and optimization strategies. 
+ +## Overview + +**Before Optimization**: +- Memory usage: 500MB for processing 10K records +- OOM (Out of Memory) errors with 100K records +- Processing time: 45 seconds for 10K records +- List comprehensions loading entire dataset + +**After Optimization**: +- Memory usage: 5MB for processing 10K records (99% reduction) +- No OOM errors with 1M records +- Processing time: 8 seconds for 10K records (82% faster) +- Generator-based streaming + +**Tools**: Scalene, pytest, memory_profiler, tracemalloc + +## 1. Scalene Installation and Setup + +### Installation + +```bash +# Install Scalene +pip install scalene + +# Or with uv (faster) +uv pip install scalene +``` + +### Basic Usage + +```bash +# Profile entire script +scalene script.py + +# Profile with pytest (recommended) +scalene --cli --memory -m pytest tests/ + +# HTML output +scalene --html --outfile profile.html script.py + +# Profile specific function +scalene --reduced-profile script.py +``` + +## 2. Profiling with pytest + +### Test File Setup + +```python +# tests/test_data_processing.py +import pytest +from data_processor import DataProcessor + +@pytest.fixture +def processor(): + return DataProcessor() + +def test_process_large_dataset(processor): + # Generate 10K records + records = [{'id': i, 'value': i * 2} for i in range(10000)] + + # Process (this is where memory spike occurs) + result = processor.process_records(records) + + assert len(result) == 10000 +``` + +### Running Scalene with pytest + +```bash +# Profile memory usage during test execution +uv run scalene --cli --memory -m pytest tests/test_data_processing.py 2>&1 | grep -i "memory\|mb\|test" + +# Output shows line-by-line memory allocation +``` + +**Scalene Output** (before optimization): +``` +data_processor.py: +Line | Memory % | Memory (MB) | CPU % | Code +-----|----------|-------------|-------|----- +12 | 45% | 225 MB | 10% | result = [transform(r) for r in records] +18 | 30% | 150 MB | 5% | filtered = [r for r in result if r['value'] > 0] +25 | 15% | 75 MB | 20% | sorted_data = sorted(filtered, key=lambda x: x['id']) +``` + +**Analysis**: Line 12 is the hotspot (45% of memory) + +## 3. Memory Hotspot Identification + +### Vulnerable Code (Memory Spike) + +```python +# data_processor.py (BEFORE OPTIMIZATION) +class DataProcessor: + def process_records(self, records: list[dict]) -> list[dict]: + # ❌ HOTSPOT: List comprehension loads entire dataset + result = [self.transform(r) for r in records] # 225MB for 10K records + + # ❌ Creates another copy + filtered = [r for r in result if r['value'] > 0] # +150MB + + # ❌ sorted() creates yet another copy + sorted_data = sorted(filtered, key=lambda x: x['id']) # +75MB + + return sorted_data # Total: 450MB for 10K records + + def transform(self, record: dict) -> dict: + return { + 'id': record['id'], + 'value': record['value'] * 2, + 'timestamp': datetime.now() + } +``` + +**Scalene Report**: +``` +Memory allocation breakdown: +- Line 12 (list comprehension): 225MB (50%) +- Line 18 (filtering): 150MB (33%) +- Line 25 (sorting): 75MB (17%) + +Total memory: 450MB for 10,000 records +Projected for 100K: 4.5GB → OOM! 
+``` + +### Optimized Code (Generator-Based) + +```python +# data_processor.py (AFTER OPTIMIZATION) +from typing import Iterator + +class DataProcessor: + def process_records(self, records: list[dict]) -> Iterator[dict]: + # ✅ Generator: processes one record at a time + transformed = (self.transform(r) for r in records) # O(1) memory + + # ✅ Generator chaining + filtered = (r for r in transformed if r['value'] > 0) # O(1) memory + + # ✅ Stream-based sorting (only if needed) + # For very large datasets, use external sorting or database ORDER BY + yield from sorted(filtered, key=lambda x: x['id']) # Still O(n), but lazy + + def transform(self, record: dict) -> dict: + return { + 'id': record['id'], + 'value': record['value'] * 2, + 'timestamp': datetime.now() + } + + # Alternative: Fully streaming (no sorting) + def process_records_streaming(self, records: list[dict]) -> Iterator[dict]: + for record in records: + transformed = self.transform(record) + if transformed['value'] > 0: + yield transformed # O(1) memory, fully streaming +``` + +**Scalene Report (After)**: +``` +Memory allocation breakdown: +- Line 12 (generator): 5MB (100% - constant overhead) +- Line 18 (filter generator): 0MB (lazy) +- Line 25 (yield): 0MB (lazy) + +Total memory: 5MB for 10,000 records (99% reduction!) +Scalable to 1M+ records without OOM +``` + +## 4. Common Memory Patterns + +### Pattern 1: List Comprehension → Generator + +**Before** (High Memory): +```python +# ❌ Loads entire list into memory +def process_large_file(filename: str) -> list[dict]: + with open(filename) as f: + lines = f.readlines() # Loads entire file (500MB) + + # Another copy + return [json.loads(line) for line in lines] # +500MB = 1GB total +``` + +**After** (Low Memory): +```python +# ✅ Generator: processes line-by-line +def process_large_file(filename: str) -> Iterator[dict]: + with open(filename) as f: + for line in f: # Reads one line at a time + yield json.loads(line) # O(1) memory +``` + +**Scalene diff**: 1GB → 5MB (99.5% reduction) + +### Pattern 2: DataFrame Memory Optimization + +**Before** (High Memory): +```python +# ❌ Loads entire CSV into memory +import pandas as pd + +def analyze_data(filename: str): + df = pd.read_csv(filename) # 10GB CSV → 10GB RAM + + # All transformations in memory + df['new_col'] = df['value'] * 2 + df_filtered = df[df['value'] > 0] + return df_filtered.groupby('category').sum() +``` + +**After** (Low Memory with Chunking): +```python +# ✅ Process in chunks +import pandas as pd + +def analyze_data(filename: str): + chunk_size = 10000 + results = [] + + # Process 10K rows at a time + for chunk in pd.read_csv(filename, chunksize=chunk_size): + chunk['new_col'] = chunk['value'] * 2 + filtered = chunk[chunk['value'] > 0] + group_result = filtered.groupby('category').sum() + results.append(group_result) + + # Combine results + return pd.concat(results).groupby(level=0).sum() # Much smaller +``` + +**Scalene diff**: 10GB → 500MB (95% reduction) + +### Pattern 3: String Concatenation + +**Before** (High Memory): +```python +# ❌ Creates new string each iteration (O(n²) memory) +def build_report(data: list[dict]) -> str: + report = "" + for item in data: # 100K items + report += f"{item['id']}: {item['value']}\n" # New string every time + return report # 500MB final string + 500MB garbage = 1GB +``` + +**After** (Low Memory): +```python +# ✅ StringIO or join (O(n) memory) +from io import StringIO + +def build_report(data: list[dict]) -> str: + buffer = StringIO() + for item in data: + buffer.write(f"{item['id']}: 
{item['value']}\n") + return buffer.getvalue() + +# Or even better: generator +def build_report_streaming(data: list[dict]) -> Iterator[str]: + for item in data: + yield f"{item['id']}: {item['value']}\n" +``` + +**Scalene diff**: 1GB → 50MB (95% reduction) + +## 5. Scalene CLI Reference + +### Common Options + +```bash +# Memory-only profiling (fastest) +scalene --cli --memory script.py + +# CPU + Memory profiling +scalene --cli --cpu --memory script.py + +# Reduced profile (functions only, not lines) +scalene --reduced-profile script.py + +# Profile specific function +scalene --profile-only process_data script.py + +# HTML report +scalene --html --outfile profile.html script.py + +# Profile with pytest +scalene --cli --memory -m pytest tests/ + +# Set memory sampling interval (default: 1MB) +scalene --malloc-threshold 0.1 script.py # Sample every 100KB +``` + +### Interpreting Output + +**Column Meanings**: +``` +Memory % | Percentage of total memory allocated +Memory MB | Absolute memory allocated (in megabytes) +CPU % | Percentage of CPU time spent +Python % | Time spent in Python (vs native code) +``` + +**Example Output**: +``` +script.py: +Line | Memory % | Memory MB | CPU % | Python % | Code +-----|----------|-----------|-------|----------|----- +12 | 45.2% | 225.6 MB | 10.5% | 95.2% | data = [x for x in range(1000000)] +18 | 30.1% | 150.3 MB | 5.2% | 98.1% | filtered = list(filter(lambda x: x > 0, data)) +``` + +**Analysis**: +- Line 12: High memory (45.2%) → optimize list comprehension +- Line 18: Moderate memory (30.1%) → use generator instead of list() + +## 6. Integration with CI/CD + +### GitHub Actions Workflow + +```yaml +# .github/workflows/memory-profiling.yml +name: Memory Profiling + +on: [pull_request] + +jobs: + profile: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install scalene pytest + + - name: Run memory profiling + run: | + scalene --cli --memory --reduced-profile -m pytest tests/ > profile.txt + + - name: Check for memory hotspots + run: | + if grep -q "Memory %" profile.txt; then + # Alert if any line uses >100MB + if awk '$3 > 100 {exit 1}' profile.txt; then + echo "Memory hotspot detected!" + exit 1 + fi + fi + + - name: Upload profile + uses: actions/upload-artifact@v3 + with: + name: memory-profile + path: profile.txt +``` + +## 7. 
Real-World Optimization: CSV Processing + +### Before (500MB Memory, OOM at 100K rows) + +```python +# csv_processor.py (BEFORE) +import pandas as pd + +class CSVProcessor: + def process_file(self, filename: str) -> dict: + # ❌ Loads entire CSV + df = pd.read_csv(filename) # 500MB for 10K rows + + # ❌ Multiple copies + df['total'] = df['quantity'] * df['price'] + df_filtered = df[df['total'] > 100] + summary = df_filtered.groupby('category').agg({ + 'total': 'sum', + 'quantity': 'sum' + }) + + return summary.to_dict() +``` + +**Scalene Output**: +``` +Line 8: 500MB (75%) - pd.read_csv() +Line 11: 100MB (15%) - df['total'] calculation +Line 12: 50MB (10%) - filtering +Total: 650MB for 10K rows +``` + +### After (5MB Memory, Handles 1M rows) + +```python +# csv_processor.py (AFTER) +import pandas as pd +from collections import defaultdict + +class CSVProcessor: + def process_file(self, filename: str) -> dict: + # ✅ Process in 10K row chunks + chunk_size = 10000 + results = defaultdict(lambda: {'total': 0, 'quantity': 0}) + + for chunk in pd.read_csv(filename, chunksize=chunk_size): + chunk['total'] = chunk['quantity'] * chunk['price'] + filtered = chunk[chunk['total'] > 100] + + # Aggregate incrementally + for category, group in filtered.groupby('category'): + results[category]['total'] += group['total'].sum() + results[category]['quantity'] += group['quantity'].sum() + + return dict(results) +``` + +**Scalene Output (After)**: +``` +Line 9: 5MB (100%) - chunk processing (constant memory) +Total: 5MB for any file size (99% reduction) +``` + +## 8. Results and Impact + +### Before vs After Metrics + +| Metric | Before | After | Impact | +|--------|--------|-------|--------| +| **Memory Usage** | 500MB (10K rows) | 5MB (1M rows) | **99% reduction** | +| **Processing Time** | 45s (10K rows) | 8s (10K rows) | **82% faster** | +| **Max File Size** | 100K rows (OOM) | 10M+ rows | **100x scalability** | +| **OOM Errors** | 5/week | 0/month | **100% eliminated** | + +### Key Optimizations Applied + +1. **List comprehension → Generator**: 225MB → 0MB +2. **DataFrame chunking**: 500MB → 5MB per chunk +3. **String concatenation**: 1GB → 50MB (StringIO) +4. **Lazy evaluation**: Load on demand vs load all + +## Related Documentation + +- **Node.js Leaks**: [nodejs-memory-leak.md](nodejs-memory-leak.md) +- **DB Leaks**: [database-connection-leak.md](database-connection-leak.md) +- **Reference**: [../reference/profiling-tools.md](../reference/profiling-tools.md) +- **Templates**: [../templates/scalene-config.txt](../templates/scalene-config.txt) + +--- + +Return to [examples index](INDEX.md) diff --git a/skills/memory-profiling/reference/INDEX.md b/skills/memory-profiling/reference/INDEX.md new file mode 100644 index 0000000..3d01f5a --- /dev/null +++ b/skills/memory-profiling/reference/INDEX.md @@ -0,0 +1,75 @@ +# Memory Profiler Reference + +Quick reference guides for memory optimization patterns, profiling tools, and garbage collection. 
+ +## Reference Guides + +### Memory Optimization Patterns + +**File**: [memory-optimization-patterns.md](memory-optimization-patterns.md) + +Comprehensive catalog of memory leak patterns and their fixes: +- **Event Listener Leaks**: EventEmitter cleanup, closure traps +- **Connection Pool Leaks**: Database connection management +- **Large Dataset Patterns**: Streaming, chunking, lazy evaluation +- **Cache Management**: LRU caches, WeakMap/WeakSet +- **Closure Memory Traps**: Variable capture, scope management + +**Use when**: Quick lookup for specific memory leak pattern + +--- + +### Profiling Tools Comparison + +**File**: [profiling-tools.md](profiling-tools.md) + +Comparison matrix and usage guide for memory profiling tools: +- **Node.js**: Chrome DevTools, heapdump, memwatch-next, clinic.js +- **Python**: Scalene, memory_profiler, tracemalloc, py-spy +- **Monitoring**: Prometheus, Grafana, DataDog APM +- **Tool Selection**: When to use which tool + +**Use when**: Choosing the right profiling tool for your stack + +--- + +### Garbage Collection Guide + +**File**: [garbage-collection-guide.md](garbage-collection-guide.md) + +Understanding and tuning garbage collectors: +- **V8 (Node.js)**: Generational GC, heap structure, --max-old-space-size +- **Python**: Reference counting, generational GC, gc.collect() +- **GC Monitoring**: Metrics, alerts, optimization +- **GC Tuning**: When and how to tune + +**Use when**: GC issues, tuning performance, understanding memory behavior + +--- + +## Quick Lookup + +**Common Patterns**: +- EventEmitter leak → [memory-optimization-patterns.md#event-listener-leaks](memory-optimization-patterns.md#event-listener-leaks) +- Connection leak → [memory-optimization-patterns.md#connection-pool-leaks](memory-optimization-patterns.md#connection-pool-leaks) +- Large dataset → [memory-optimization-patterns.md#large-dataset-patterns](memory-optimization-patterns.md#large-dataset-patterns) + +**Tool Selection**: +- Node.js profiling → [profiling-tools.md#nodejs-tools](profiling-tools.md#nodejs-tools) +- Python profiling → [profiling-tools.md#python-tools](profiling-tools.md#python-tools) +- Production monitoring → [profiling-tools.md#monitoring-tools](profiling-tools.md#monitoring-tools) + +**GC Issues**: +- Node.js heap → [garbage-collection-guide.md#v8-heap](garbage-collection-guide.md#v8-heap) +- Python GC → [garbage-collection-guide.md#python-gc](garbage-collection-guide.md#python-gc) +- GC metrics → [garbage-collection-guide.md#gc-monitoring](garbage-collection-guide.md#gc-monitoring) + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Full walkthroughs +- **Templates**: [Templates Index](../templates/INDEX.md) - Memory report templates +- **Main Agent**: [memory-profiler.md](../memory-profiler.md) - Memory profiler agent + +--- + +Return to [main agent](../memory-profiler.md) diff --git a/skills/memory-profiling/reference/garbage-collection-guide.md b/skills/memory-profiling/reference/garbage-collection-guide.md new file mode 100644 index 0000000..fed2b4b --- /dev/null +++ b/skills/memory-profiling/reference/garbage-collection-guide.md @@ -0,0 +1,392 @@ +# Garbage Collection Guide + +Understanding and tuning garbage collectors in Node.js (V8) and Python for optimal memory management. 
+ +## V8 Garbage Collector (Node.js) + +### Heap Structure + +**Two Generations**: +``` +┌─────────────────────────────────────────────────────────┐ +│ V8 Heap │ +├─────────────────────────────────────────────────────────┤ +│ New Space (Young Generation) - 8MB-32MB │ +│ ┌─────────────┬─────────────┐ │ +│ │ From-Space │ To-Space │ ← Minor GC (Scavenge) │ +│ └─────────────┴─────────────┘ │ +│ │ +│ Old Space (Old Generation) - Remaining heap │ +│ ┌──────────────────────────────────────┐ │ +│ │ Long-lived objects │ ← Major GC │ +│ │ (survived 2+ Minor GCs) │ (Mark-Sweep)│ +│ └──────────────────────────────────────┘ │ +│ │ +│ Large Object Space - Objects >512KB │ +└─────────────────────────────────────────────────────────┘ +``` + +**GC Types**: +- **Scavenge (Minor GC)**: Fast (~1ms), clears new space, runs frequently +- **Mark-Sweep (Major GC)**: Slow (100-500ms), clears old space, runs when old space fills +- **Mark-Compact**: Like Mark-Sweep but also defragments memory + +--- + +### Monitoring V8 GC + +**Built-in GC Traces**: +```bash +# Enable GC logging +node --trace-gc server.js + +# Output: +# [12345:0x104800000] 42 ms: Scavenge 8.5 (10.2) -> 7.8 (10.2) MB +# [12345:0x104800000] 123 ms: Mark-sweep 95.2 (100.5) -> 82.3 (100.5) MB +``` + +**Parse GC logs**: +``` +[PID:address] time ms: GC-type before (heap) -> after (heap) MB + +Scavenge = Minor GC (young generation) +Mark-sweep = Major GC (old generation) +``` + +**Prometheus Metrics**: +```typescript +import { Gauge } from 'prom-client'; +import v8 from 'v8'; + +const heap_size = new Gauge({ name: 'nodejs_heap_size_total_bytes' }); +const heap_used = new Gauge({ name: 'nodejs_heap_used_bytes' }); +const gc_duration = new Histogram({ + name: 'nodejs_gc_duration_seconds', + labelNames: ['kind'] +}); + +// Track GC events +const PerformanceObserver = require('perf_hooks').PerformanceObserver; +const obs = new PerformanceObserver((list) => { + const entry = list.getEntries()[0]; + gc_duration.labels(entry.kind).observe(entry.duration / 1000); +}); +obs.observe({ entryTypes: ['gc'] }); + +// Update heap metrics every 10s +setInterval(() => { + const stats = v8.getHeapStatistics(); + heap_size.set(stats.total_heap_size); + heap_used.set(stats.used_heap_size); +}, 10000); +``` + +--- + +### V8 GC Tuning + +**Heap Size Limits**: +```bash +# Default: ~1.4GB on 64-bit systems +# Increase max heap size +node --max-old-space-size=4096 server.js # 4GB heap + +# For containers (set to 75% of container memory) +# 8GB container → --max-old-space-size=6144 +``` + +**GC Optimization Flags**: +```bash +# Aggressive GC (lower memory, more CPU) +node --optimize-for-size --gc-interval=100 server.js + +# Optimize for throughput (higher memory, less CPU) +node --max-old-space-size=8192 server.js + +# Expose GC to JavaScript +node --expose-gc server.js +# Then: global.gc() to force GC +``` + +**When to tune**: +- ✅ Container memory limits (set heap to 75% of limit) +- ✅ Frequent Major GC causing latency spikes +- ✅ OOM errors with available memory +- ❌ Don't tune as first step (fix leaks first!) + +--- + +## Python Garbage Collector + +### GC Mechanism + +**Two Systems**: +1. **Reference Counting**: Primary mechanism, immediate cleanup when refcount = 0 +2. 
**Generational GC**: Handles circular references + +**Generational Structure**: +``` +┌─────────────────────────────────────────────────────────┐ +│ Python GC (Generational) │ +├─────────────────────────────────────────────────────────┤ +│ Generation 0 (Young) - Threshold: 700 objects │ +│ ├─ New objects │ +│ └─ Collected most frequently │ +│ │ +│ Generation 1 (Middle) - Threshold: 10 collections │ +│ ├─ Survived 1 Gen0 collection │ +│ └─ Collected less frequently │ +│ │ +│ Generation 2 (Old) - Threshold: 10 collections │ +│ ├─ Survived Gen1 collection │ +│ └─ Collected rarely │ +└─────────────────────────────────────────────────────────┘ +``` + +--- + +### Monitoring Python GC + +**GC Statistics**: +```python +import gc + +# Get GC stats +print(gc.get_stats()) +# [{'collections': 42, 'collected': 123, 'uncollectable': 0}, ...] + +# Get object count by generation +print(gc.get_count()) +# (45, 3, 1) = (gen0, gen1, gen2) object counts + +# Get thresholds +print(gc.get_threshold()) +# (700, 10, 10) = collect when gen0 has 700 objects, etc. +``` + +**Track GC Pauses**: +```python +import gc +import time + +class GCMonitor: + def __init__(self): + self.start_time = None + + def on_gc_start(self, phase, info): + self.start_time = time.time() + + def on_gc_finish(self, phase, info): + duration = time.time() - self.start_time + print(f"GC {phase}: {duration*1000:.1f}ms, collected {info['collected']}") + +# Install callbacks +gc.callbacks.append(GCMonitor().on_gc_start) +``` + +**Prometheus Metrics**: +```python +from prometheus_client import Gauge, Histogram +import gc + +gc_collections = Gauge('python_gc_collections_total', 'GC collections', ['generation']) +gc_collected = Gauge('python_gc_objects_collected_total', 'Objects collected', ['generation']) +gc_duration = Histogram('python_gc_duration_seconds', 'GC duration', ['generation']) + +def record_gc_metrics(): + stats = gc.get_stats() + for gen, stat in enumerate(stats): + gc_collections.labels(generation=gen).set(stat['collections']) + gc_collected.labels(generation=gen).set(stat['collected']) +``` + +--- + +### Python GC Tuning + +**Disable GC (for batch jobs)**: +```python +import gc + +# Disable automatic GC +gc.disable() + +# Process large dataset without GC pauses +for chunk in large_dataset: + process(chunk) + +# Manual GC at end +gc.collect() +``` + +**Adjust Thresholds**: +```python +import gc + +# Default: (700, 10, 10) +# More aggressive: collect more often, lower memory +gc.set_threshold(400, 5, 5) + +# Less aggressive: collect less often, higher memory but faster +gc.set_threshold(1000, 15, 15) +``` + +**Debug Circular References**: +```python +import gc + +# Find objects that can't be collected +gc.set_debug(gc.DEBUG_SAVEALL) +gc.collect() + +print(f"Uncollectable: {len(gc.garbage)}") +for obj in gc.garbage: + print(type(obj), obj) +``` + +**When to tune**: +- ✅ Batch jobs: disable GC, manual collect at end +- ✅ Real-time systems: adjust thresholds to avoid long pauses +- ✅ Debugging: use `DEBUG_SAVEALL` to find leaks +- ❌ Don't disable GC in long-running services (memory will grow!) 
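+**Avoid Circular References (weakref)**:
+
+`DEBUG_SAVEALL` helps diagnose cycles after the fact; the complementary tactic is to not create them in the first place. The sketch below uses hypothetical `Parent`/`Child` classes (not taken from any service in this guide) to show the weak back-reference pattern: the child observes its parent through `weakref.ref`, so plain reference counting reclaims both objects and the generational collector never gets involved.
+
+```python
+import gc
+import weakref
+
+
+class Parent:
+    """Owns its children via ordinary (strong) references."""
+
+    def __init__(self):
+        self.children = []
+
+    def add_child(self, child):
+        self.children.append(child)
+        child.set_parent(self)
+
+
+class Child:
+    """Holds only a weak back-reference, so it never keeps its Parent alive."""
+
+    def __init__(self):
+        self._parent_ref = None
+
+    def set_parent(self, parent):
+        # weakref.ref() does not increment the parent's reference count
+        self._parent_ref = weakref.ref(parent)
+
+    @property
+    def parent(self):
+        # Dereference; returns None once the Parent has been collected
+        return self._parent_ref() if self._parent_ref is not None else None
+
+
+if __name__ == "__main__":
+    p = Parent()
+    p.add_child(Child())
+    del p
+    # No cycle exists, so refcounting frees both objects immediately
+    print(gc.collect())  # typically 0
+```
+
+The same principle applies to `weakref.WeakValueDictionary` and `weakref.WeakSet` when a cache or registry should observe objects without extending their lifetime.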
+ +--- + +## GC-Related Memory Issues + +### Issue 1: Long GC Pauses + +**Symptom**: Request latency spikes every few minutes + +**V8 Fix**: +```bash +# Monitor GC pauses +node --trace-gc server.js 2>&1 | grep "Mark-sweep" + +# If Major GC >500ms, increase heap size +node --max-old-space-size=4096 server.js +``` + +**Python Fix**: +```python +# Disable GC during request handling +import gc +gc.disable() + +# Periodic manual GC (in background thread) +import threading +def periodic_gc(): + while True: + time.sleep(60) + gc.collect() +threading.Thread(target=periodic_gc, daemon=True).start() +``` + +--- + +### Issue 2: Frequent Minor GC + +**Symptom**: High CPU from constant minor GC + +**Cause**: Too many short-lived objects + +**Fix**: Reduce allocations +```python +# ❌ BAD: Creates many temporary objects +def process_data(items): + return [str(i) for i in items] # New list + strings + +# ✅ BETTER: Generator (no intermediate list) +def process_data(items): + return (str(i) for i in items) +``` + +--- + +### Issue 3: Memory Not Released After GC + +**Symptom**: Heap usage high even after GC + +**V8 Cause**: Objects in old generation (major GC needed) +```bash +# Force full GC to reclaim memory +node --expose-gc server.js + +# In code: +if (global.gc) global.gc(); +``` + +**Python Cause**: Reference cycles +```python +# Debug reference cycles +import gc +import sys + +# Find what's keeping object alive +obj = my_object +print(sys.getrefcount(obj)) # Should be low + +# Get referrers +print(gc.get_referrers(obj)) +``` + +--- + +## GC Alerts (Prometheus) + +```yaml +# Prometheus alert rules +groups: + - name: gc_alerts + rules: + # V8: Major GC taking too long + - alert: SlowMajorGC + expr: nodejs_gc_duration_seconds{kind="major"} > 0.5 + for: 5m + annotations: + summary: "Major GC >500ms ({{ $value }}s)" + + # V8: High GC frequency + - alert: FrequentGC + expr: rate(nodejs_gc_duration_seconds_count[5m]) > 10 + for: 10m + annotations: + summary: "GC running >10x/min" + + # Python: High Gen2 collections + - alert: FrequentFullGC + expr: rate(python_gc_collections_total{generation="2"}[1h]) > 1 + for: 1h + annotations: + summary: "Full GC >1x/hour (potential leak)" +``` + +--- + +## Best Practices + +### V8 (Node.js) + +1. **Set heap size**: `--max-old-space-size` to 75% of container memory +2. **Monitor GC**: Track duration and frequency with Prometheus +3. **Alert on slow GC**: Major GC >500ms indicates heap too small or memory leak +4. **Don't force GC**: Let V8 manage (except for tests/debugging) + +### Python + +1. **Use reference counting**: Most cleanup is automatic (refcount = 0) +2. **Avoid circular refs**: Use `weakref` for back-references +3. **Batch jobs**: Disable GC, manual `gc.collect()` at end +4. **Monitor Gen2**: Frequent Gen2 collections = potential leak + +--- + +## Related Documentation + +- **Patterns**: [memory-optimization-patterns.md](memory-optimization-patterns.md) +- **Tools**: [profiling-tools.md](profiling-tools.md) +- **Examples**: [Examples Index](../examples/INDEX.md) + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/memory-profiling/reference/memory-optimization-patterns.md b/skills/memory-profiling/reference/memory-optimization-patterns.md new file mode 100644 index 0000000..ba37f35 --- /dev/null +++ b/skills/memory-profiling/reference/memory-optimization-patterns.md @@ -0,0 +1,371 @@ +# Memory Optimization Patterns Reference + +Quick reference catalog of common memory leak patterns and their fixes. 
+ +## Event Listener Leaks + +### Pattern: EventEmitter Accumulation + +**Symptom**: Memory grows linearly with time/requests +**Cause**: Event listeners added but never removed + +**Vulnerable**: +```typescript +// ❌ LEAK: listener added every call +class DataProcessor { + private emitter = new EventEmitter(); + + async process() { + this.emitter.on('data', handler); // Never removed + } +} +``` + +**Fixed**: +```typescript +// ✅ FIX 1: Remove listener +this.emitter.on('data', handler); +try { /* work */ } finally { + this.emitter.removeListener('data', handler); +} + +// ✅ FIX 2: Use once() +this.emitter.once('data', handler); // Auto-removed + +// ✅ FIX 3: Use AbortController +const controller = new AbortController(); +this.emitter.on('data', handler, { signal: controller.signal }); +controller.abort(); // Removes listener +``` + +**Detection**: +```typescript +// Check listener count +console.log(emitter.listenerCount('data')); // Should be constant + +// Monitor in production +process.on('warning', (warning) => { + if (warning.name === 'MaxListenersExceededWarning') { + console.error('Listener leak detected:', warning); + } +}); +``` + +--- + +## Closure Memory Traps + +### Pattern: Captured Variables in Closures + +**Symptom**: Memory not released after scope exits +**Cause**: Closure captures large variables + +**Vulnerable**: +```typescript +// ❌ LEAK: Closure captures entire 1GB buffer +function createHandler(largeBuffer: Buffer) { + return function handler() { + // Only uses buffer.length, but captures entire buffer + console.log(largeBuffer.length); + }; +} +``` + +**Fixed**: +```typescript +// ✅ FIX: Extract only what's needed +function createHandler(largeBuffer: Buffer) { + const length = largeBuffer.length; // Extract value + return function handler() { + console.log(length); // Only captures number, not Buffer + }; +} +``` + +--- + +## Connection Pool Leaks + +### Pattern: Unclosed Database Connections + +**Symptom**: Pool exhaustion, connection timeouts +**Cause**: Connections acquired but not released + +**Vulnerable**: +```python +# ❌ LEAK: Connection never closed on exception +def get_orders(): + conn = pool.acquire() + orders = conn.execute("SELECT * FROM orders") + return orders # conn never released +``` + +**Fixed**: +```python +# ✅ FIX: Context manager guarantees cleanup +def get_orders(): + with pool.acquire() as conn: + orders = conn.execute("SELECT * FROM orders") + return orders # conn auto-released +``` + +--- + +## Large Dataset Patterns + +### Pattern 1: Loading Entire File into Memory + +**Vulnerable**: +```python +# ❌ LEAK: 10GB file → 20GB RAM +df = pd.read_csv("large.csv") +``` + +**Fixed**: +```python +# ✅ FIX: Chunking +for chunk in pd.read_csv("large.csv", chunksize=10000): + process(chunk) # Constant memory + +# ✅ BETTER: Polars streaming +df = pl.scan_csv("large.csv").collect(streaming=True) +``` + +### Pattern 2: List Comprehension vs Generator + +**Vulnerable**: +```python +# ❌ LEAK: Entire list in memory +result = [process(item) for item in huge_list] +``` + +**Fixed**: +```python +# ✅ FIX: Generator (lazy evaluation) +result = (process(item) for item in huge_list) +for item in result: + use(item) # Processes one at a time +``` + +--- + +## Cache Management + +### Pattern: Unbounded Cache Growth + +**Vulnerable**: +```typescript +// ❌ LEAK: Cache grows forever +const cache = new Map(); + +function getData(key: string) { + if (!cache.has(key)) { + cache.set(key, fetchData(key)); // Never evicted + } + return cache.get(key); +} +``` + +**Fixed**: 
+```typescript +// ✅ FIX 1: LRU cache with max size +import { LRUCache } from 'lru-cache'; + +const cache = new LRUCache({ + max: 1000, // Max 1000 entries + ttl: 1000 * 60 * 5 // 5 minute TTL +}); + +// ✅ FIX 2: WeakMap (auto-cleanup when key GC'd) +const cache = new WeakMap(); +cache.set(key, data); // Auto-removed when key is GC'd +``` + +--- + +## Timer and Interval Leaks + +### Pattern: Forgotten Timers + +**Vulnerable**: +```typescript +// ❌ LEAK: Timer never cleared +class Component { + startPolling() { + setInterval(() => { + this.fetchData(); // Keeps Component alive forever + }, 1000); + } +} +``` + +**Fixed**: +```typescript +// ✅ FIX: Clear timer on cleanup +class Component { + private intervalId?: NodeJS.Timeout; + + startPolling() { + this.intervalId = setInterval(() => { + this.fetchData(); + }, 1000); + } + + cleanup() { + if (this.intervalId) { + clearInterval(this.intervalId); + } + } +} +``` + +--- + +## Global Variable Accumulation + +### Pattern: Growing Global Arrays + +**Vulnerable**: +```typescript +// ❌ LEAK: Array grows forever +const logs: string[] = []; + +function log(message: string) { + logs.push(message); // Never cleared +} +``` + +**Fixed**: +```typescript +// ✅ FIX 1: Bounded array +const MAX_LOGS = 1000; +const logs: string[] = []; + +function log(message: string) { + logs.push(message); + if (logs.length > MAX_LOGS) { + logs.shift(); // Remove oldest + } +} + +// ✅ FIX 2: Circular buffer +import { CircularBuffer } from 'circular-buffer'; +const logs = new CircularBuffer(1000); +``` + +--- + +## String Concatenation + +### Pattern: Repeated String Concatenation + +**Vulnerable**: +```python +# ❌ LEAK: Creates new string each iteration (O(n²)) +result = "" +for item in items: + result += str(item) # New string allocation +``` + +**Fixed**: +```python +# ✅ FIX 1: Join +result = "".join(str(item) for item in items) + +# ✅ FIX 2: StringIO +from io import StringIO +buffer = StringIO() +for item in items: + buffer.write(str(item)) +result = buffer.getvalue() +``` + +--- + +## React Component Leaks + +### Pattern: setState After Unmount + +**Vulnerable**: +```typescript +// ❌ LEAK: setState called after unmount +function Component() { + const [data, setData] = useState(null); + + useEffect(() => { + fetchData().then(setData); // If unmounted, causes leak + }, []); +} +``` + +**Fixed**: +```typescript +// ✅ FIX: Cleanup with AbortController +function Component() { + const [data, setData] = useState(null); + + useEffect(() => { + const controller = new AbortController(); + + fetchData(controller.signal).then(setData); + + return () => controller.abort(); // Cleanup + }, []); +} +``` + +--- + +## Detection Patterns + +### Memory Leak Indicators + +1. **Linear growth**: Memory usage increases linearly with time/requests +2. **Pool exhaustion**: Connection pool hits max size +3. **EventEmitter warnings**: "MaxListenersExceededWarning" +4. **GC pressure**: Frequent/long GC pauses +5. 
**OOM errors**: Process crashes with "JavaScript heap out of memory" + +### Monitoring Metrics + +```typescript +// Prometheus metrics for leak detection +const heap_used = new Gauge({ + name: 'nodejs_heap_used_bytes', + help: 'V8 heap used bytes' +}); + +const event_listeners = new Gauge({ + name: 'event_listeners_total', + help: 'Total event listeners', + labelNames: ['event'] +}); + +// Alert if heap grows >10% per hour +// Alert if listener count >100 for single event +``` + +--- + +## Quick Fixes Checklist + +- [ ] **Event listeners**: Use `once()` or `removeListener()` +- [ ] **Database connections**: Use context managers or `try/finally` +- [ ] **Large datasets**: Use chunking or streaming +- [ ] **Caches**: Implement LRU or WeakMap +- [ ] **Timers**: Clear with `clearInterval()` or `clearTimeout()` +- [ ] **Closures**: Extract values, avoid capturing large objects +- [ ] **React**: Cleanup in `useEffect()` return +- [ ] **Strings**: Use `join()` or `StringIO`, not `+=` + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) +- **Tools**: [profiling-tools.md](profiling-tools.md) +- **GC**: [garbage-collection-guide.md](garbage-collection-guide.md) + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/memory-profiling/reference/profiling-tools.md b/skills/memory-profiling/reference/profiling-tools.md new file mode 100644 index 0000000..ff204cc --- /dev/null +++ b/skills/memory-profiling/reference/profiling-tools.md @@ -0,0 +1,407 @@ +# Memory Profiling Tools Comparison + +Quick reference for choosing and using memory profiling tools across Node.js, Python, and production monitoring. + +## Node.js Tools + +### Chrome DevTools (Built-in) + +**Best for**: Interactive heap snapshot analysis, timeline profiling +**Cost**: Free (built into Node.js) + +**Usage**: +```bash +# Start Node.js with inspector +node --inspect server.js + +# Open chrome://inspect +# Click "Open dedicated DevTools for Node" +``` + +**Features**: +- Heap snapshots (memory state at point in time) +- Timeline recording (allocations over time) +- Comparison view (find leaks by comparing snapshots) +- Retainer paths (why object not GC'd) + +**When to use**: +- Development/staging environments +- Interactive debugging sessions +- Visual leak analysis + +--- + +### heapdump (npm package) + +**Best for**: Production heap snapshots without restarts +**Cost**: Free (npm package) + +**Usage**: +```typescript +import heapdump from 'heapdump'; + +// Trigger snapshot on signal +process.on('SIGUSR2', () => { + heapdump.writeSnapshot((err, filename) => { + console.log('Heap dump written to', filename); + }); +}); + +// Auto-snapshot on OOM +heapdump.writeSnapshot('./oom-' + Date.now() + '.heapsnapshot'); +``` + +**When to use**: +- Production memory leak diagnosis +- Scheduled snapshots (daily/weekly) +- OOM analysis (capture before crash) + +--- + +### clinic.js (Comprehensive Suite) + +**Best for**: All-in-one performance profiling +**Cost**: Free (open source) + +**Usage**: +```bash +# Install +npm install -g clinic + +# Memory profiling +clinic heapprofiler -- node server.js + +# Generates interactive HTML report +``` + +**Features**: +- Heap profiler (memory allocations) +- Flame graphs (CPU + memory) +- Timeline visualization +- Automatic leak detection + +**When to use**: +- Initial performance investigation +- Comprehensive profiling (CPU + memory) +- Team-friendly reports (HTML) + +--- + +### memwatch-next + +**Best for**: Real-time leak detection in production +**Cost**: 
Free (npm package) + +**Usage**: +```typescript +import memwatch from '@airbnb/node-memwatch'; + +memwatch.on('leak', (info) => { + console.error('Memory leak detected:', info); + // Alert, log, snapshot, etc. +}); + +memwatch.on('stats', (stats) => { + console.log('GC stats:', stats); +}); +``` + +**When to use**: +- Production leak monitoring +- Automatic alerting +- GC pressure tracking + +--- + +## Python Tools + +### Scalene (Line-by-Line Profiler) + +**Best for**: Fastest, most detailed Python profiler +**Cost**: Free (pip package) + +**Usage**: +```bash +# Install +pip install scalene + +# Profile script +scalene script.py + +# Profile with pytest +scalene --cli --memory -m pytest tests/ + +# HTML report +scalene --html --outfile profile.html script.py +``` + +**Features**: +- Line-by-line memory allocation +- CPU profiling +- GPU profiling +- Native code vs Python time +- Memory timeline + +**When to use**: +- Python memory optimization +- Line-level bottleneck identification +- pytest integration + +--- + +### memory_profiler + +**Best for**: Simple decorator-based profiling +**Cost**: Free (pip package) + +**Usage**: +```python +from memory_profiler import profile + +@profile +def my_function(): + a = [1] * (10 ** 6) + b = [2] * (2 * 10 ** 7) + return a + b + +# Run with: python -m memory_profiler script.py +``` + +**When to use**: +- Quick function-level profiling +- Simple memory debugging +- Educational/learning + +--- + +### tracemalloc (Built-in) + +**Best for**: Production memory tracking without dependencies +**Cost**: Free (Python standard library) + +**Usage**: +```python +import tracemalloc + +tracemalloc.start() + +# Your code here + +current, peak = tracemalloc.get_traced_memory() +print(f"Current: {current / 1024 / 1024:.1f} MB") +print(f"Peak: {peak / 1024 / 1024:.1f} MB") + +# Top allocations +snapshot = tracemalloc.take_snapshot() +top_stats = snapshot.statistics('lineno') +for stat in top_stats[:10]: + print(stat) + +tracemalloc.stop() +``` + +**When to use**: +- Production environments (no external dependencies) +- Allocation tracking +- Top allocators identification + +--- + +### py-spy (Sampling Profiler) + +**Best for**: Zero-overhead production profiling +**Cost**: Free (cargo/pip package) + +**Usage**: +```bash +# Install +pip install py-spy + +# Attach to running process (no code changes!) 
+py-spy top --pid 12345 + +# Flame graph +py-spy record --pid 12345 --output profile.svg +``` + +**When to use**: +- Production profiling (minimal overhead) +- No code modification required +- Running process analysis + +--- + +## Monitoring Tools + +### Prometheus + Grafana + +**Best for**: Production metrics and alerting +**Cost**: Free (open source) + +**Metrics to track**: +```typescript +import { Gauge, Histogram } from 'prom-client'; + +// Heap usage +const heap_used = new Gauge({ + name: 'nodejs_heap_used_bytes', + help: 'V8 heap used bytes' +}); + +// Memory allocation rate +const allocation_rate = new Gauge({ + name: 'memory_allocation_bytes_per_second', + help: 'Memory allocation rate' +}); + +// Connection pool +const pool_active = new Gauge({ + name: 'db_pool_connections_active', + help: 'Active database connections' +}); +``` + +**Alerts**: +```yaml +# Prometheus alert rules +groups: + - name: memory_alerts + rules: + - alert: MemoryLeak + expr: increase(nodejs_heap_used_bytes[1h]) > 100000000 # +100MB/hour + for: 6h + annotations: + summary: "Potential memory leak ({{ $value | humanize }} growth)" + + - alert: HeapNearLimit + expr: nodejs_heap_used_bytes / nodejs_heap_size_bytes > 0.9 + for: 5m + annotations: + summary: "Heap usage >90%" +``` + +**When to use**: +- Production monitoring (all environments) +- Long-term trend analysis +- Automatic alerting + +--- + +### DataDog APM + +**Best for**: Comprehensive observability platform +**Cost**: Paid (starts $15/host/month) + +**Features**: +- Automatic heap tracking +- Memory leak detection +- Distributed tracing +- Alert management +- Dashboards + +**When to use**: +- Enterprise environments +- Multi-service tracing +- Managed solution preferred + +--- + +## Tool Selection Matrix + +| Scenario | Node.js Tool | Python Tool | Monitoring | +|----------|-------------|-------------|------------| +| **Development debugging** | Chrome DevTools | Scalene | - | +| **Production leak** | heapdump | py-spy | Prometheus | +| **Line-level analysis** | clinic.js | Scalene | - | +| **Real-time monitoring** | memwatch-next | tracemalloc | Grafana | +| **Zero overhead** | - | py-spy | DataDog | +| **No dependencies** | Chrome DevTools | tracemalloc | - | +| **Team reports** | clinic.js | Scalene HTML | Grafana | + +--- + +## Quick Start Commands + +### Node.js + +```bash +# Development: Chrome DevTools +node --inspect server.js + +# Production: Heap snapshot +kill -USR2 # If heapdump configured + +# Comprehensive: clinic.js +clinic heapprofiler -- node server.js +``` + +### Python + +```bash +# Line-by-line: Scalene +scalene --cli --memory script.py + +# Quick profile: memory_profiler +python -m memory_profiler script.py + +# Production: py-spy +py-spy top --pid +``` + +### Monitoring + +```bash +# Prometheus metrics +curl http://localhost:9090/metrics | grep memory + +# Grafana dashboard +# Import dashboard ID: 11159 (Node.js) +# Import dashboard ID: 7362 (Python) +``` + +--- + +## Tool Comparison Table + +| Tool | Language | Type | Overhead | Production-Safe | Interactive | +|------|----------|------|----------|----------------|-------------| +| **Chrome DevTools** | Node.js | Heap snapshot | Low | No | Yes | +| **heapdump** | Node.js | Heap snapshot | Low | Yes | No | +| **clinic.js** | Node.js | Profiler | Medium | No | Yes | +| **memwatch-next** | Node.js | Real-time | Low | Yes | No | +| **Scalene** | Python | Profiler | Low | Staging | Yes | +| **memory_profiler** | Python | Decorator | Medium | No | No | +| **tracemalloc** | Python | 
Built-in | Low | Yes | No | +| **py-spy** | Python | Sampling | Very Low | Yes | No | +| **Prometheus** | Both | Metrics | Very Low | Yes | Yes (Grafana) | +| **DataDog** | Both | APM | Very Low | Yes | Yes | + +--- + +## Best Practices + +### Development Workflow + +1. **Initial investigation**: Chrome DevTools (Node.js) or Scalene (Python) +2. **Line-level analysis**: clinic.js or Scalene with `--html` +3. **Root cause**: Heap snapshot comparison (DevTools) +4. **Validation**: Load testing with monitoring + +### Production Workflow + +1. **Detection**: Prometheus alerts (heap growth, pool exhaustion) +2. **Diagnosis**: heapdump snapshot or py-spy sampling +3. **Analysis**: Chrome DevTools (load snapshot) or Scalene (if reproducible in staging) +4. **Monitoring**: Grafana dashboards for trends + +--- + +## Related Documentation + +- **Patterns**: [memory-optimization-patterns.md](memory-optimization-patterns.md) +- **GC**: [garbage-collection-guide.md](garbage-collection-guide.md) +- **Examples**: [Examples Index](../examples/INDEX.md) + +--- + +Return to [reference index](INDEX.md) diff --git a/skills/memory-profiling/templates/INDEX.md b/skills/memory-profiling/templates/INDEX.md new file mode 100644 index 0000000..abf3cc0 --- /dev/null +++ b/skills/memory-profiling/templates/INDEX.md @@ -0,0 +1,60 @@ +# Memory Profiler Templates + +Ready-to-use templates for memory profiling reports and heap snapshot analysis. + +## Templates Overview + +### Memory Investigation Report + +**File**: [memory-report-template.md](memory-report-template.md) + +Template for documenting memory leak investigations: +- **Incident Summary**: Timeline, symptoms, impact +- **Investigation Steps**: Tools used, findings +- **Root Cause**: Code analysis, leak pattern identified +- **Fix Implementation**: Code changes, validation +- **Results**: Before/after metrics + +**Use when**: Documenting memory leak investigations for team/postmortems + +--- + +### Heap Snapshot Analysis Checklist + +**File**: [heap-snapshot-analysis.md](heap-snapshot-analysis.md) + +Step-by-step checklist for analyzing V8 heap snapshots: +- **Snapshot Collection**: When/how to capture snapshots +- **Comparison Analysis**: Finding leaks by comparing snapshots +- **Retainer Analysis**: Understanding why objects not GC'd +- **Common Patterns**: EventEmitter, closures, timers + +**Use when**: Analyzing heap snapshots in Chrome DevTools + +--- + +## Quick Usage + +### Memory Report + +1. Copy template: `cp templates/memory-report-template.md docs/investigations/memory-leak-YYYY-MM-DD.md` +2. Fill in sections as you investigate +3. Share with team for review + +### Heap Analysis + +1. Open template: `templates/heap-snapshot-analysis.md` +2. Follow checklist step-by-step +3. 
Document findings in memory report + +--- + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Full investigation examples +- **Reference**: [Reference Index](../reference/INDEX.md) - Pattern catalog +- **Main Agent**: [memory-profiler.md](../memory-profiler.md) - Memory profiler agent + +--- + +Return to [main agent](../memory-profiler.md) diff --git a/skills/memory-profiling/templates/memory-report-template.md b/skills/memory-profiling/templates/memory-report-template.md new file mode 100644 index 0000000..32e87fe --- /dev/null +++ b/skills/memory-profiling/templates/memory-report-template.md @@ -0,0 +1,322 @@ +# Memory Leak Investigation Report + +**Service**: [Service Name] +**Date**: [YYYY-MM-DD] +**Investigator**: [Your Name] +**Severity**: [Critical/High/Medium/Low] + +--- + +## Executive Summary + +**TL;DR**: [One sentence summary of the leak, cause, and fix] + +**Impact**: +- Memory growth: [X MB/hour or X% increase] +- OOM incidents: [Number of crashes] +- Affected users: [Number or percentage] +- Duration: [How long the leak existed] + +**Resolution**: +- Root cause: [Leak pattern - e.g., "EventEmitter listeners not removed"] +- Fix deployed: [Date/time] +- Status: [Resolved/Monitoring/In Progress] + +--- + +## Incident Timeline + +| Time | Event | Details | +|------|-------|---------| +| [HH:MM] | Detection | [How was leak detected? Alert, manual observation, etc.] | +| [HH:MM] | Investigation started | [Initial actions taken] | +| [HH:MM] | Root cause identified | [What was found] | +| [HH:MM] | Fix implemented | [Code changes made] | +| [HH:MM] | Fix deployed | [Deployment details] | +| [HH:MM] | Validation complete | [Confirmation that leak is fixed] | + +--- + +## Symptoms and Detection + +### Initial Symptoms + +- [ ] Linear memory growth (X MB/hour) +- [ ] OOM crashes (frequency: ___) +- [ ] GC pressure (frequent/long pauses) +- [ ] Connection pool exhaustion +- [ ] Service degradation (slow responses) +- [ ] Other: ___ + +### Detection Method + +**How Discovered**: [Alert, monitoring dashboard, user report, etc.] + +**Monitoring Data**: +``` +Prometheus query: [Query used to detect the leak] +Alert rule: [Alert name/threshold] +Dashboard: [Link to Grafana dashboard] +``` + +**Example Metrics**: +``` +Before: +- Heap usage baseline: X MB +- After 6 hours: Y MB +- Growth rate: Z MB/hour + +Current: +- Heap usage: [Current value] +- Active connections: [Number] +- GC pause duration: [p95 value] +``` + +--- + +## Investigation Steps + +### 1. Initial Data Collection + +**Tools Used**: +- [ ] Chrome DevTools heap snapshots +- [ ] Node.js `--trace-gc` logs +- [ ] Python Scalene profiling +- [ ] Prometheus metrics +- [ ] Application logs +- [ ] Other: ___ + +**Heap Snapshots Collected**: +``` +Snapshot 1: [timestamp] - [size] MB - [location/filename] +Snapshot 2: [timestamp] - [size] MB - [location/filename] +Snapshot 3: [timestamp] - [size] MB - [location/filename] +``` + +### 2. Snapshot Comparison Analysis + +**Method**: [Comparison view in Chrome DevTools, diff analysis, etc.] + +**Findings**: +``` +Objects growing between snapshots: +- [Object type 1]: +X instances (+Y MB) +- [Object type 2]: +X instances (+Y MB) +- [Object type 3]: +X instances (+Y MB) + +Top 3 memory consumers: +1. [Object type] - X MB - [Retainer path] +2. [Object type] - X MB - [Retainer path] +3. [Object type] - X MB - [Retainer path] +``` + +### 3. 
Retainer Path Analysis + +**Leaked Object**: [Type of object that's leaking] + +**Retainer Path**: +``` +Window / Global + → [Variable name] + → [Object/function] + → [Property] + → [Leaked object] +``` + +**Why Not GC'd**: [Explanation of what's keeping object alive] + +--- + +## Root Cause Analysis + +### Leak Pattern Identified + +**Pattern**: [e.g., EventEmitter leak, closure trap, unclosed connection, etc.] + +**Vulnerable Code** (before fix): +```typescript +// File: [filepath]:[line] +// [Brief explanation of why this leaks] + +[Paste vulnerable code here] +``` + +**Why This Leaks**: +1. [Step 1 of how the leak occurs] +2. [Step 2] +3. [Result: memory accumulates] + +### Reproduction Steps + +1. [Step to reproduce leak in dev/staging] +2. [Step 2] +3. [Observed result: memory growth] + +**Reproduction Time**: [How long to observe leak? Minutes/hours] + +--- + +## Fix Implementation + +### Code Changes + +**Pull Request**: [Link to PR] + +**Files Modified**: +- [file1.ts] - [Brief description of change] +- [file2.ts] - [Brief description of change] + +**Fixed Code**: +```typescript +// File: [filepath]:[line] +// [Brief explanation of fix] + +[Paste fixed code here] +``` + +**Fix Strategy**: +- [ ] Remove event listeners (use `removeListener()` or `once()`) +- [ ] Close connections (use context managers or `try/finally`) +- [ ] Clear timers (use `clearInterval()`/`clearTimeout()`) +- [ ] Use WeakMap/WeakSet (for cache) +- [ ] Implement generator/streaming (for large datasets) +- [ ] Other: ___ + +### Testing and Validation + +**Tests Added**: +```typescript +// Test that verifies no leak +describe('Memory leak fix', () => { + it('should not leak listeners', () => { + const before = emitter.listenerCount('event'); + // ... execute code + const after = emitter.listenerCount('event'); + expect(after).toBe(before); // No leak + }); +}); +``` + +**Load Test Results**: +``` +Before fix: +- Memory after 1000 requests: X MB +- Memory after 10000 requests: Y MB (growth) + +After fix: +- Memory after 1000 requests: X MB +- Memory after 10000 requests: X MB (stable) +``` + +--- + +## Deployment and Results + +### Deployment Details + +**Environment**: [staging/production] +**Deployment Time**: [YYYY-MM-DD HH:MM UTC] +**Rollout Strategy**: [Canary, blue-green, rolling, etc.] + +### Post-Deployment Metrics + +**Before Fix**: +``` +Memory baseline: X MB +Memory after 6h: Y MB +Growth rate: Z MB/hour +OOM incidents: N/week +``` + +**After Fix**: +``` +Memory baseline: X MB +Memory after 6h: X MB (stable!) 
+Growth rate: 0 MB/hour +OOM incidents: 0/month +``` + +**Improvement**: +- Memory reduction: [X% or Y MB] +- OOM elimination: [100%] +- GC pressure: [Reduced by X%] + +### Grafana Dashboard + +**Link**: [Dashboard URL] + +**Key Panels**: +- Heap usage trend: [Shows memory stable after fix] +- GC pause duration: [Shows improved GC behavior] +- Error rate: [Shows OOM errors eliminated] + +--- + +## Lessons Learned + +### What Went Well + +- [Positive aspect 1] +- [Positive aspect 2] + +### What Could Be Improved + +- [Improvement area 1] +- [Improvement area 2] + +### Preventive Measures + +**Monitoring Added**: +- [ ] Alert: Memory growth >X MB/hour for >Y hours +- [ ] Alert: Heap usage >Z% of limit +- [ ] Dashboard: Memory trend visualization +- [ ] Alert: Connection pool saturation >X% + +**Code Review Checklist Updated**: +- [ ] Event listeners properly cleaned up +- [ ] Database connections closed +- [ ] Timers/intervals cleared +- [ ] Large datasets processed with streaming/chunking + +**Testing Standards**: +- [ ] Memory leak tests for event listeners +- [ ] Load tests with memory monitoring +- [ ] CI/CD checks for connection cleanup + +--- + +## Related Documentation + +- **Pattern Catalog**: [Link to memory-optimization-patterns.md] +- **Similar Incidents**: [Links to previous memory leak reports] +- **Runbook**: [Link to memory leak runbook] + +--- + +## Appendix + +### Heap Snapshot Files + +- [snapshot1.heapsnapshot] - [Location/S3 URL] +- [snapshot2.heapsnapshot] - [Location/S3 URL] + +### GC Logs + +``` +[Relevant GC log excerpts showing the leak] +``` + +### Prometheus Queries + +```promql +# Memory growth rate +rate(nodejs_heap_used_bytes[1h]) + +# GC pause duration +histogram_quantile(0.95, rate(nodejs_gc_duration_seconds_bucket[5m])) +``` + +--- + +**Report Completed**: [YYYY-MM-DD] +**Next Review**: [Date for follow-up validation] diff --git a/skills/observability-engineering/SKILL.md b/skills/observability-engineering/SKILL.md new file mode 100644 index 0000000..cc664b4 --- /dev/null +++ b/skills/observability-engineering/SKILL.md @@ -0,0 +1,26 @@ +# Observability Engineering Skill + +Production-ready monitoring, logging, and tracing using Prometheus, Grafana, OpenTelemetry, DataDog, and Sentry. + +## Description + +Comprehensive observability setup including SLO implementation, distributed tracing, dashboards, and incident prevention. + +## What's Included + +- **Examples**: Prometheus configs, Grafana dashboards, SLO definitions +- **Reference**: Observability best practices, monitoring strategies +- **Templates**: Dashboard templates, alert rules + +## Use When + +- Setting up production monitoring +- Implementing SLOs +- Distributed tracing +- Performance tracking + +## Related Agents + +- `observability-engineer` + +**Skill Version**: 1.0 diff --git a/skills/observability-engineering/checklists/observability-setup-checklist.md b/skills/observability-engineering/checklists/observability-setup-checklist.md new file mode 100644 index 0000000..8acd35b --- /dev/null +++ b/skills/observability-engineering/checklists/observability-setup-checklist.md @@ -0,0 +1,600 @@ +# Observability Engineering Setup Checklist + +Comprehensive checklist for implementing production-grade observability with logs, metrics, traces, and alerts. 
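+
+Before working through the items below, it helps to see the end state the logging items describe. Here is a minimal sketch of a structured logger with a per-request correlation ID, using `structlog` (one of the stack options listed below); the service name and field values are placeholders, and note that `structlog` emits the message under the `event` key rather than `message` by default:
+
+```python
+import logging
+import uuid
+
+import structlog
+
+# Production-style configuration: JSON output, INFO level
+structlog.configure(
+    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
+    processors=[
+        structlog.processors.add_log_level,
+        structlog.processors.TimeStamper(fmt="iso", key="timestamp"),
+        structlog.processors.JSONRenderer(),
+    ],
+)
+
+log = structlog.get_logger(service="api-server", environment="production")
+
+# Bind a correlation ID so every line from this request can be tied together
+request_log = log.bind(request_id=str(uuid.uuid4()))
+request_log.info(
+    "User logged in",
+    duration_ms=150,
+    http={"method": "POST", "path": "/api/login", "status": 200},
+)
+```
+
+In development you would typically swap `JSONRenderer` for `structlog.dev.ConsoleRenderer()` and lower the level to DEBUG, matching the environment-based configuration items in the logging section.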
+ +## Pre-Implementation Planning + +- [ ] **Define observability goals** (debug production issues, monitor SLAs, detect anomalies) +- [ ] **Choose observability stack**: + - [ ] Logging: Pino (Node.js), structlog (Python), CloudWatch, Datadog + - [ ] Metrics: Prometheus, Datadog, CloudWatch + - [ ] Tracing: OpenTelemetry, Datadog APM, Jaeger + - [ ] Visualization: Grafana, Datadog, Honeycomb + +- [ ] **Set up observability infrastructure** (collectors, storage, dashboards) +- [ ] **Define data retention** policies (logs: 30 days, metrics: 1 year, traces: 7 days) +- [ ] **Plan for scale** (log volume, metric cardinality, trace sampling) + +## Structured Logging + +### Logger Configuration + +- [ ] **Structured logger installed**: + - Node.js: `pino` with `pino-pretty` for dev + - Python: `structlog` with JSON formatter + - Browser: Custom JSON logger or service integration + +- [ ] **Log levels defined**: + - [ ] TRACE: Very detailed debugging + - [ ] DEBUG: Detailed debugging info + - [ ] INFO: General informational messages + - [ ] WARN: Warning messages (recoverable issues) + - [ ] ERROR: Error messages (failures) + - [ ] FATAL: Critical failures (application crash) + +- [ ] **Environment-based configuration**: + - [ ] Development: Pretty-printed logs, DEBUG level + - [ ] Production: JSON logs, INFO level + - [ ] Test: Silent or minimal logs + +### Log Structure + +- [ ] **Standard log format** across services: + ```json + { + "level": "info", + "timestamp": "2025-11-10T10:30:00.000Z", + "service": "api-server", + "environment": "production", + "tenant_id": "uuid", + "user_id": "uuid", + "request_id": "uuid", + "message": "User logged in", + "duration_ms": 150, + "http": { + "method": "POST", + "path": "/api/login", + "status": 200, + "user_agent": "Mozilla/5.0..." 
+ } + } + ``` + +- [ ] **Correlation IDs** added: + - [ ] request_id: Unique per request + - [ ] session_id: Unique per session + - [ ] trace_id: Unique per distributed trace + - [ ] tenant_id: Multi-tenant context + +- [ ] **Context propagation** through request lifecycle +- [ ] **Sensitive data redacted** (passwords, tokens, credit cards) + +### What to Log + +- [ ] **Request/response logging**: + - [ ] HTTP method, path, status code + - [ ] Request duration + - [ ] User agent, IP address (hashed or anonymized) + - [ ] Query parameters (non-sensitive) + +- [ ] **Authentication events**: + - [ ] Login success/failure + - [ ] Logout + - [ ] Token refresh + - [ ] Permission checks + +- [ ] **Business events**: + - [ ] User registration + - [ ] Payment processing + - [ ] Data exports + - [ ] Admin actions + +- [ ] **Errors and exceptions**: + - [ ] Error message + - [ ] Stack trace + - [ ] Error context (what user was doing) + - [ ] Affected resources (user_id, tenant_id, entity_id) + +- [ ] **Performance metrics**: + - [ ] Database query times + - [ ] External API call times + - [ ] Cache hit/miss rates + - [ ] Background job durations + +### Log Aggregation + +- [ ] **Logs shipped** to central location: + - [ ] CloudWatch Logs + - [ ] Datadog Logs + - [ ] Elasticsearch (ELK stack) + - [ ] Splunk + +- [ ] **Log retention** configured (30-90 days typical) +- [ ] **Log volume** monitored (cost management) +- [ ] **Log sampling** for high-volume services (if needed) + +## Application Metrics + +### Metric Types + +- [ ] **Counters** for events that only increase: + - [ ] Total requests + - [ ] Total errors + - [ ] Total registrations + +- [ ] **Gauges** for values that go up and down: + - [ ] Active connections + - [ ] Memory usage + - [ ] Queue depth + +- [ ] **Histograms** for distributions: + - [ ] Request duration + - [ ] Response size + - [ ] Database query time + +- [ ] **Summaries** for quantiles (p50, p95, p99) + +### Standard Metrics + +#### HTTP Metrics + +- [ ] **http_requests_total** (counter): + - Labels: method, path, status, tenant_id + - Track total requests per endpoint + +- [ ] **http_request_duration_seconds** (histogram): + - Labels: method, path, status + - Buckets: 0.1, 0.5, 1, 2, 5, 10 seconds + +- [ ] **http_request_size_bytes** (histogram) +- [ ] **http_response_size_bytes** (histogram) + +#### Database Metrics + +- [ ] **db_queries_total** (counter): + - Labels: operation (SELECT, INSERT, UPDATE, DELETE), table + +- [ ] **db_query_duration_seconds** (histogram): + - Labels: operation, table + - Track slow queries (p95, p99) + +- [ ] **db_connection_pool_size** (gauge) +- [ ] **db_connection_pool_available** (gauge) + +#### Application Metrics + +- [ ] **active_sessions** (gauge) +- [ ] **background_jobs_total** (counter): + - Labels: job_name, status (success, failure) + +- [ ] **background_job_duration_seconds** (histogram): + - Labels: job_name + +- [ ] **cache_operations_total** (counter): + - Labels: operation (hit, miss, set, delete) + +- [ ] **external_api_calls_total** (counter): + - Labels: service, status + +- [ ] **external_api_duration_seconds** (histogram): + - Labels: service + +#### System Metrics + +- [ ] **process_cpu_usage_percent** (gauge) +- [ ] **process_memory_usage_bytes** (gauge) +- [ ] **process_heap_usage_bytes** (gauge) - JavaScript specific +- [ ] **process_open_file_descriptors** (gauge) + +### Metric Collection + +- [ ] **Prometheus client library** installed: + - Node.js: `prom-client` + - Python: `prometheus-client` + - Custom: 
OpenTelemetry SDK + +- [ ] **Metrics endpoint** exposed (`/metrics`) +- [ ] **Prometheus scrapes** endpoint (or push to gateway) +- [ ] **Metric naming** follows conventions: + - Lowercase with underscores + - Unit suffixes (_seconds, _bytes, _total) + - Namespace prefix (myapp_http_requests_total) + +### Multi-Tenant Metrics + +- [ ] **tenant_id label** on all relevant metrics +- [ ] **Per-tenant dashboards** (filter by tenant_id) +- [ ] **Tenant resource usage** tracked: + - [ ] API calls per tenant + - [ ] Database storage per tenant + - [ ] Data transfer per tenant + +- [ ] **Tenant quotas** monitored (alert on approaching limit) + +## Distributed Tracing + +### Tracing Setup + +- [ ] **OpenTelemetry SDK** installed: + - Node.js: `@opentelemetry/sdk-node` + - Python: `opentelemetry-sdk` + +- [ ] **Tracing backend** configured: + - [ ] Jaeger (self-hosted) + - [ ] Datadog APM + - [ ] Honeycomb + - [ ] AWS X-Ray + +- [ ] **Auto-instrumentation** enabled: + - [ ] HTTP client/server + - [ ] Database queries + - [ ] Redis operations + - [ ] Message queues + +### Span Creation + +- [ ] **Custom spans** for business logic: + ```typescript + const span = tracer.startSpan('process-payment'); + span.setAttribute('tenant_id', tenantId); + span.setAttribute('amount', amount); + try { + await processPayment(); + span.setStatus({ code: SpanStatusCode.OK }); + } catch (error) { + span.recordException(error); + span.setStatus({ code: SpanStatusCode.ERROR }); + throw error; + } finally { + span.end(); + } + ``` + +- [ ] **Span attributes** include context: + - [ ] tenant_id, user_id, request_id + - [ ] Input parameters (non-sensitive) + - [ ] Result status + +- [ ] **Span events** for key moments: + - [ ] "Payment started" + - [ ] "Database query executed" + - [ ] "External API called" + +### Trace Context Propagation + +- [ ] **W3C Trace Context** headers propagated: + - traceparent: trace-id, parent-span-id, flags + - tracestate: vendor-specific data + +- [ ] **Context propagated** across: + - [ ] HTTP requests (frontend ↔ backend) + - [ ] Background jobs + - [ ] Message queues + - [ ] Microservices + +- [ ] **Trace ID** included in logs (correlate logs + traces) + +### Sampling + +- [ ] **Sampling strategy** defined: + - [ ] Head-based: Sample at trace start (1%, 10%, 100%) + - [ ] Tail-based: Sample after trace completes (error traces, slow traces) + - [ ] Adaptive: Sample based on load + +- [ ] **Always sample** errors and slow requests +- [ ] **Sample rate** appropriate for volume (start high, reduce if needed) + +## Alerting + +### Alert Definitions + +- [ ] **Error rate alerts**: + - [ ] Condition: Error rate > 5% for 5 minutes + - [ ] Severity: Critical + - [ ] Action: Page on-call engineer + +- [ ] **Latency alerts**: + - [ ] Condition: p95 latency > 1s for 10 minutes + - [ ] Severity: Warning + - [ ] Action: Slack notification + +- [ ] **Availability alerts**: + - [ ] Condition: Health check fails 3 consecutive times + - [ ] Severity: Critical + - [ ] Action: Page on-call + auto-restart + +- [ ] **Resource alerts**: + - [ ] Memory usage > 80% + - [ ] CPU usage > 80% + - [ ] Disk usage > 85% + - [ ] Database connections > 90% of pool + +- [ ] **Business metric alerts**: + - [ ] Registration rate drops > 50% + - [ ] Payment failures increase > 10% + - [ ] Active users drop significantly + +### Alert Channels + +- [ ] **PagerDuty** (or equivalent) for critical alerts +- [ ] **Slack** for warnings and notifications +- [ ] **Email** for non-urgent alerts +- [ ] **SMS** for highest priority (only 
use sparingly) + +### Alert Management + +- [ ] **Alert fatigue** prevented: + - [ ] Appropriate thresholds (not too sensitive) + - [ ] Proper severity levels (not everything is critical) + - [ ] Alert aggregation (deduplicate similar alerts) + +- [ ] **Runbooks** for each alert: + - [ ] What the alert means + - [ ] How to investigate + - [ ] How to resolve + - [ ] Escalation path + +- [ ] **Alert suppression** during deployments (planned downtime) +- [ ] **Alert escalation** if not acknowledged + +## Dashboards & Visualization + +### Standard Dashboards + +- [ ] **Service Overview** dashboard: + - [ ] Request rate (requests/sec) + - [ ] Error rate (errors/sec, %) + - [ ] Latency (p50, p95, p99) + - [ ] Availability (uptime %) + +- [ ] **Database** dashboard: + - [ ] Query rate + - [ ] Slow queries (p95, p99) + - [ ] Connection pool usage + - [ ] Table sizes + +- [ ] **System Resources** dashboard: + - [ ] CPU usage + - [ ] Memory usage + - [ ] Disk I/O + - [ ] Network I/O + +- [ ] **Business Metrics** dashboard: + - [ ] Active users + - [ ] Registrations + - [ ] Revenue + - [ ] Feature usage + +### Dashboard Best Practices + +- [ ] **Auto-refresh** enabled (every 30-60 seconds) +- [ ] **Time range** configurable (last hour, 24h, 7 days) +- [ ] **Drill-down** to detailed views +- [ ] **Annotations** for deployments/incidents +- [ ] **Shared dashboards** accessible to team + +### Per-Tenant Dashboards + +- [ ] **Tenant filter** on all relevant dashboards +- [ ] **Tenant resource usage** visualized +- [ ] **Tenant-specific alerts** (if large customer) +- [ ] **Tenant comparison** view (compare usage across tenants) + +## Health Checks + +### Endpoint Implementation + +- [ ] **Health check endpoint** (`/health` or `/healthz`): + - [ ] Returns 200 OK when healthy + - [ ] Returns 503 Service Unavailable when unhealthy + - [ ] Includes subsystem status + +```json +{ + "status": "healthy", + "version": "1.2.3", + "uptime_seconds": 86400, + "checks": { + "database": "healthy", + "redis": "healthy", + "external_api": "degraded" + } +} +``` + +- [ ] **Liveness probe** (`/health/live`): + - [ ] Checks if application is running + - [ ] Fails → restart container + +- [ ] **Readiness probe** (`/health/ready`): + - [ ] Checks if application is ready to serve traffic + - [ ] Fails → remove from load balancer + +### Health Check Coverage + +- [ ] **Database connectivity** checked +- [ ] **Cache connectivity** checked (Redis, Memcached) +- [ ] **External APIs** checked (optional, can cause false positives) +- [ ] **Disk space** checked +- [ ] **Critical dependencies** checked + +### Monitoring Health Checks + +- [ ] **Uptime monitoring** service (Pingdom, UptimeRobot, Datadog Synthetics) +- [ ] **Check frequency** appropriate (every 1-5 minutes) +- [ ] **Alerting** on failed health checks +- [ ] **Geographic monitoring** (check from multiple regions) + +## Error Tracking + +### Error Capture + +- [ ] **Error tracking service** integrated: + - [ ] Sentry + - [ ] Datadog Error Tracking + - [ ] Rollbar + - [ ] Custom solution + +- [ ] **Unhandled exceptions** captured automatically +- [ ] **Handled errors** reported when appropriate +- [ ] **Error context** included: + - [ ] User ID, tenant ID + - [ ] Request ID, trace ID + - [ ] User actions (breadcrumbs) + - [ ] Environment variables (non-sensitive) + +### Error Grouping + +- [ ] **Errors grouped** by fingerprint (same error, different occurrences) +- [ ] **Error rate** tracked per group +- [ ] **Alerting** on new error types or spike in existing +- [ ] 
**Error assignment** to team members +- [ ] **Resolution tracking** (mark errors as resolved) + +### Privacy & Security + +- [ ] **PII redacted** from error reports: + - [ ] Passwords, tokens, API keys + - [ ] Credit card numbers + - [ ] Email addresses (unless necessary) + - [ ] SSNs, tax IDs + +- [ ] **Source maps** uploaded for frontend (de-minify stack traces) +- [ ] **Release tagging** (associate errors with deployments) + +## Performance Monitoring + +### Real User Monitoring (RUM) + +- [ ] **RUM tool integrated** (Datadog RUM, New Relic Browser, Google Analytics): + - [ ] Page load times + - [ ] Core Web Vitals (LCP, FID, CLS) + - [ ] JavaScript errors + - [ ] User sessions + +- [ ] **Performance budgets** defined: + - [ ] First Contentful Paint < 1.8s + - [ ] Largest Contentful Paint < 2.5s + - [ ] Time to Interactive < 3.8s + - [ ] Cumulative Layout Shift < 0.1 + +- [ ] **Alerting** on performance regressions + +### Application Performance Monitoring (APM) + +- [ ] **APM tool** integrated (Datadog APM, New Relic APM): + - [ ] Trace every request + - [ ] Identify slow endpoints + - [ ] Database query analysis + - [ ] External API profiling + +- [ ] **Performance profiling** for critical paths: + - [ ] Authentication flow + - [ ] Payment processing + - [ ] Data exports + - [ ] Complex queries + +## Cost Management + +- [ ] **Observability costs** tracked: + - [ ] Log ingestion costs + - [ ] Metric cardinality costs + - [ ] Trace sampling costs + - [ ] Dashboard/seat costs + +- [ ] **Cost optimization**: + - [ ] Log sampling for high-volume services + - [ ] Metric aggregation (reduce cardinality) + - [ ] Trace sampling (not 100% in production) + - [ ] Data retention policies + +- [ ] **Budget alerts** configured + +## Security & Compliance + +- [ ] **Access control** on observability tools (role-based) +- [ ] **Audit logging** for observability access +- [ ] **Data retention** complies with regulations (GDPR, HIPAA) +- [ ] **Data encryption** in transit and at rest +- [ ] **PII handling** compliant (redaction, anonymization) + +## Testing Observability + +- [ ] **Log output** tested in unit tests: + ```typescript + test('logs user login', () => { + const logs = captureLogs(); + await loginUser(); + expect(logs).toContainEqual( + expect.objectContaining({ + level: 'info', + message: 'User logged in', + user_id: expect.any(String) + }) + ); + }); + ``` + +- [ ] **Metrics** incremented in tests +- [ ] **Traces** created in integration tests +- [ ] **Health checks** tested +- [ ] **Alert thresholds** tested (inject failures, verify alert fires) + +## Documentation + +- [ ] **Observability runbook** created: + - [ ] How to access logs, metrics, traces + - [ ] How to create dashboards + - [ ] How to set up alerts + - [ ] Common troubleshooting queries + +- [ ] **Alert runbooks** for each alert +- [ ] **Dashboard documentation** (what each panel shows) +- [ ] **Metric dictionary** (what each metric means) +- [ ] **On-call procedures** documented + +## Scoring + +- **85+ items checked**: Excellent - Production-grade observability ✅ +- **65-84 items**: Good - Most observability covered ⚠️ +- **45-64 items**: Fair - Significant gaps exist 🔴 +- **<45 items**: Poor - Not ready for production ❌ + +## Priority Items + +Address these first: +1. **Structured logging** - Foundation for debugging +2. **Error tracking** - Catch and fix bugs quickly +3. **Health checks** - Know when service is down +4. **Alerting** - Get notified of issues +5. 
**Key metrics** - Request rate, error rate, latency + +## Common Pitfalls + +❌ **Don't:** +- Log sensitive data (passwords, tokens, PII) +- Create high-cardinality metrics (user_id as label) +- Trace 100% of requests in production (sample instead) +- Alert on every anomaly (alert fatigue) +- Ignore observability until there's a problem + +✅ **Do:** +- Log at appropriate levels (use DEBUG for verbose) +- Use correlation IDs throughout request lifecycle +- Set up alerts with clear runbooks +- Review dashboards regularly (detect issues early) +- Iterate on observability (improve over time) + +## Related Resources + +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [Pino Logger](https://getpino.io) +- [Prometheus Best Practices](https://prometheus.io/docs/practices/) +- [observability-engineering skill](../SKILL.md) + +--- + +**Total Items**: 140+ observability checks +**Critical Items**: Logging, Metrics, Alerting, Health checks +**Coverage**: Logs, Metrics, Traces, Alerts, Dashboards +**Last Updated**: 2025-11-10 diff --git a/skills/observability-engineering/examples/INDEX.md b/skills/observability-engineering/examples/INDEX.md new file mode 100644 index 0000000..1dd5cc6 --- /dev/null +++ b/skills/observability-engineering/examples/INDEX.md @@ -0,0 +1,136 @@ +# Observability Examples + +Production-ready observability implementations for Grey Haven stack (Cloudflare Workers, TanStack Start, FastAPI, PostgreSQL). + +## Examples Overview + +### Prometheus + Grafana Setup + +**File**: [prometheus-grafana-setup.md](prometheus-grafana-setup.md) + +Complete monitoring stack for Kubernetes with Golden Signals: +- **Prometheus Deployment** - Helm charts, service discovery, scrape configs +- **Grafana Setup** - Dashboard-as-code, templating, alerting +- **Node Exporter** - System metrics collection (CPU, memory, disk) +- **kube-state-metrics** - Kubernetes resource metrics +- **Golden Signals** - Request rate, error rate, latency (p50/p95/p99), saturation +- **Recording Rules** - Pre-aggregated metrics for fast queries +- **Alert Manager** - PagerDuty integration, escalation policies +- **Before/After Metrics** - Response time improved 40%, MTTR reduced 60% + +**Use when**: Setting up production monitoring, implementing SRE practices, cloud-native deployments + +--- + +### OpenTelemetry Distributed Tracing + +**File**: [opentelemetry-tracing.md](opentelemetry-tracing.md) + +Distributed tracing for microservices with Jaeger: +- **OTel Collector** - Receiver/processor/exporter pipelines +- **Auto-Instrumentation** - Zero-code tracing for Node.js, Python, FastAPI +- **Context Propagation** - W3C Trace Context across services +- **Sampling Strategies** - Head-based (10%), tail-based (errors only) +- **Span Attributes** - HTTP method, status code, user ID, tenant ID +- **Trace Visualization** - Jaeger UI, dependency graphs, critical path +- **Performance Impact** - <5ms overhead, 2% CPU increase +- **Before/After** - MTTR 45min → 8min (82% reduction) + +**Use when**: Debugging microservices, understanding latency, optimizing critical paths + +--- + +### SLO & Error Budget Framework + +**File**: [slo-error-budgets.md](slo-error-budgets.md) + +Complete SLI/SLO/Error Budget implementation: +- **SLI Definition** - Availability (99.9%), latency (p95 < 200ms), error rate (< 0.5%) +- **SLO Targets** - Critical (99.95%), Essential (99.9%), Standard (99.5%) +- **Error Budget Calculation** - Monthly budget, burn rate monitoring (1h/6h/24h windows) +- **Prometheus Recording Rules** - Multi-window 
SLI calculations +- **Grafana SLO Dashboard** - Real-time status, budget remaining, burn rate graphs +- **Budget Policies** - Feature freeze at 25% remaining, postmortem required at depletion +- **Burn Rate Alerts** - PagerDuty escalation when burning too fast +- **Impact** - 99.95% availability achieved, 3 feature freezes prevented overspend + +**Use when**: Implementing SRE practices, balancing reliability with velocity, production deployments + +--- + +### DataDog APM Integration + +**File**: [datadog-apm.md](datadog-apm.md) + +Application Performance Monitoring for Grey Haven stack: +- **DataDog Agent** - Cloudflare Workers instrumentation, FastAPI tracing +- **Custom Metrics** - Business KPIs (checkout success rate, revenue per minute) +- **Real User Monitoring (RUM)** - Frontend performance, user sessions, error tracking +- **APM Traces** - Distributed tracing with Cloudflare Workers, database queries +- **Log Correlation** - Trace ID in logs, unified troubleshooting +- **Synthetic Monitoring** - API health checks every 1 minute from 10 locations +- **Anomaly Detection** - ML-powered alerts for unusual patterns +- **Cost** - $31/host/month, $40/million spans +- **Before/After** - 99.5% → 99.95% availability (10x fewer incidents) + +**Use when**: Commercial APM needed, executive dashboards required, startup budget allows + +--- + +### Centralized Logging with Fluentd + Elasticsearch + +**File**: [centralized-logging.md](centralized-logging.md) + +Production log aggregation for multi-region deployments: +- **Fluentd DaemonSet** - Kubernetes log collection from all pods +- **Structured Logging** - JSON format with trace ID, user ID, tenant ID +- **Elasticsearch Indexing** - Daily indices with rollover, ILM policies +- **Kibana Dashboards** - Error tracking, request patterns, audit logs +- **Log Parsing** - Grok patterns for FastAPI, TanStack Start, PostgreSQL +- **Retention** - Hot (7 days), Warm (30 days), Cold (90 days), Archive (1 year) +- **PII Redaction** - Automatic SSN, credit card, email masking +- **Volume** - 500GB/day ingested, 90% compression, $800/month cost +- **Before/After** - Log search 5min → 10sec, disk usage 10TB → 1TB + +**Use when**: Debugging production issues, compliance requirements (SOC2/PCI), audit trails + +--- + +### Chaos Engineering with Gremlin + +**File**: [chaos-engineering.md](chaos-engineering.md) + +Reliability testing and circuit breaker validation: +- **Gremlin Setup** - Agent deployment, blast radius configuration +- **Chaos Experiments** - Pod termination, network latency (100ms), CPU stress (80%) +- **Circuit Breaker** - Automatic fallback when error rate > 50% +- **Hypothesis** - "API handles 50% pod failures without user impact" +- **Validation** - Prometheus metrics, distributed traces, user session monitoring +- **Results** - Circuit breaker engaged in 2sec, fallback success rate 99.8% +- **Runbook** - Automatic rollback triggers, escalation procedures +- **Impact** - Found 3 critical bugs before production, confidence in resilience + +**Use when**: Pre-production validation, testing disaster recovery, chaos engineering practice + +--- + +## Quick Navigation + +| Topic | File | Lines | Focus | +|-------|------|-------|-------| +| **Prometheus + Grafana** | [prometheus-grafana-setup.md](prometheus-grafana-setup.md) | ~480 | Golden Signals monitoring | +| **OpenTelemetry** | [opentelemetry-tracing.md](opentelemetry-tracing.md) | ~450 | Distributed tracing | +| **SLO Framework** | [slo-error-budgets.md](slo-error-budgets.md) | ~420 | Error 
budget management | +| **DataDog APM** | [datadog-apm.md](datadog-apm.md) | ~400 | Commercial APM | +| **Centralized Logging** | [centralized-logging.md](centralized-logging.md) | ~440 | Log aggregation | +| **Chaos Engineering** | [chaos-engineering.md](chaos-engineering.md) | ~350 | Reliability testing | + +## Related Documentation + +- **Reference**: [Reference Index](../reference/INDEX.md) - PromQL, Golden Signals, SLO best practices +- **Templates**: [Templates Index](../templates/INDEX.md) - Grafana dashboards, SLO definitions +- **Main Agent**: [observability-engineer.md](../observability-engineer.md) - Observability agent + +--- + +Return to [main agent](../observability-engineer.md) diff --git a/skills/observability-engineering/reference/INDEX.md b/skills/observability-engineering/reference/INDEX.md new file mode 100644 index 0000000..6c76adb --- /dev/null +++ b/skills/observability-engineering/reference/INDEX.md @@ -0,0 +1,67 @@ +# Observability Reference Documentation + +Comprehensive reference guides for production observability patterns, PromQL queries, and SRE best practices. + +## Reference Overview + +### PromQL Query Language Guide + +**File**: [promql-guide.md](promql-guide.md) + +Complete PromQL reference for Prometheus queries: +- **Metric types**: Counter, Gauge, Histogram, Summary +- **PromQL functions**: rate(), irate(), increase(), sum(), avg(), histogram_quantile() +- **Recording rules**: Pre-aggregated metrics for performance +- **Alerting queries**: Burn rate calculations, threshold alerts +- **Performance tips**: Query optimization, avoiding cardinality explosions + +**Use when**: Writing Prometheus queries, creating recording rules, debugging slow queries + +--- + +### Golden Signals Reference + +**File**: [golden-signals.md](golden-signals.md) + +Google SRE Golden Signals implementation guide: +- **Request Rate (Traffic)**: RPS calculations, per-service breakdowns +- **Error Rate**: 5xx errors, client vs server errors, error budget impact +- **Latency (Duration)**: p50/p95/p99 percentiles, latency SLOs +- **Saturation**: CPU, memory, disk, connection pools + +**Use when**: Designing monitoring dashboards, implementing SLIs, understanding system health + +--- + +### SLO Best Practices + +**File**: [slo-best-practices.md](slo-best-practices.md) + +Google SRE SLO/SLI/Error Budget framework: +- **SLI selection**: Choosing meaningful indicators (availability, latency, throughput) +- **SLO targets**: Critical (99.95%), Essential (99.9%), Standard (99.5%) +- **Error budget policies**: Feature freeze thresholds, postmortem requirements +- **Multi-window burn rate alerts**: 1h, 6h, 24h windows +- **SLO review cadence**: Weekly reviews, quarterly adjustments + +**Use when**: Implementing SLO framework, setting reliability targets, balancing velocity with reliability + +--- + +## Quick Navigation + +| Topic | File | Lines | Focus | +|-------|------|-------|-------| +| **PromQL** | [promql-guide.md](promql-guide.md) | ~450 | Query language reference | +| **Golden Signals** | [golden-signals.md](golden-signals.md) | ~380 | Four signals implementation | +| **SLO Practices** | [slo-best-practices.md](slo-best-practices.md) | ~420 | Google SRE framework | + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Production implementations +- **Templates**: [Templates Index](../templates/INDEX.md) - Copy-paste configurations +- **Main Agent**: [observability-engineer.md](../observability-engineer.md) - Observability agent + +--- + +Return to [main 
agent](../observability-engineer.md) diff --git a/skills/observability-engineering/templates/INDEX.md b/skills/observability-engineering/templates/INDEX.md new file mode 100644 index 0000000..daafcc9 --- /dev/null +++ b/skills/observability-engineering/templates/INDEX.md @@ -0,0 +1,72 @@ +# Observability Templates + +Copy-paste ready configuration templates for Prometheus, Grafana, and OpenTelemetry. + +## Templates Overview + +### Grafana Dashboard Template + +**File**: [grafana-dashboard.json](grafana-dashboard.json) + +Production-ready Golden Signals dashboard: +- **Request Rate**: Total RPS with 5-minute averages +- **Error Rate**: Percentage of 5xx errors with alert thresholds +- **Latency**: p50/p95/p99 percentiles in milliseconds +- **Saturation**: CPU and memory usage percentages + +**Use when**: Creating new service dashboards, standardizing monitoring + +--- + +### SLO Definition Template + +**File**: [slo-definition.yaml](slo-definition.yaml) + +Service Level Objective configuration: +- **SLO tiers**: Critical (99.95%), Essential (99.9%), Standard (99.5%) +- **SLI definitions**: Availability, latency, error rate +- **Error budget policy**: Feature freeze thresholds +- **Multi-window burn rate alerts**: 1h, 6h, 24h windows + +**Use when**: Implementing SLO framework for new services + +--- + +### Prometheus Recording Rules + +**File**: [prometheus-recording-rules.yaml](prometheus-recording-rules.yaml) + +Pre-aggregated metrics for fast dashboards: +- **Request rates**: Per-service, per-endpoint RPS +- **Error rates**: Percentage calculations (5xx / total) +- **Latency percentiles**: p50/p95/p99 pre-computed +- **Error budget**: Remaining budget and burn rate + +**Use when**: Optimizing slow dashboard queries, implementing SLOs + +--- + +## Quick Usage + +```bash +# Copy template to your monitoring directory +cp templates/grafana-dashboard.json ../monitoring/dashboards/ + +# Edit service name and thresholds +vim ../monitoring/dashboards/grafana-dashboard.json + +# Import to Grafana +curl -X POST http://admin:password@localhost:3000/api/dashboards/db \ + -H "Content-Type: application/json" \ + -d @../monitoring/dashboards/grafana-dashboard.json +``` + +## Related Documentation + +- **Examples**: [Examples Index](../examples/INDEX.md) - Full implementations +- **Reference**: [Reference Index](../reference/INDEX.md) - PromQL, SLO guides +- **Main Agent**: [observability-engineer.md](../observability-engineer.md) - Observability agent + +--- + +Return to [main agent](../observability-engineer.md) diff --git a/skills/observability-engineering/templates/grafana-dashboard.json b/skills/observability-engineering/templates/grafana-dashboard.json new file mode 100644 index 0000000..9eb41a6 --- /dev/null +++ b/skills/observability-engineering/templates/grafana-dashboard.json @@ -0,0 +1,210 @@ +{ + "dashboard": { + "title": "Golden Signals - [Service Name]", + "tags": ["golden-signals", "production", "slo"], + "timezone": "UTC", + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "panels": [ + { + "id": 1, + "title": "Request Rate (RPS)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))", + "legendFormat": "Total RPS", + "refId": "A" + }, + { + "expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m])) by (method)", + "legendFormat": "{{method}}", + "refId": "B" + } + ], + "yaxes": [ + {"format": "reqps", "label": "Requests/sec"}, + {"format": "short"} + 
], + "legend": {"show": true, "alignAsTable": true, "avg": true, "max": true, "current": true} + }, + { + "id": 2, + "title": "Error Rate (%)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "expr": "(sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))) * 100", + "legendFormat": "Error Rate %", + "refId": "A" + } + ], + "yaxes": [ + {"format": "percent", "label": "Error %", "max": 5}, + {"format": "short"} + ], + "alert": { + "name": "High Error Rate", + "conditions": [ + { + "evaluator": {"params": [1], "type": "gt"}, + "operator": {"type": "and"}, + "query": {"params": ["A", "5m", "now"]}, + "type": "query" + } + ], + "frequency": "1m", + "for": "5m", + "message": "Error rate > 1% for 5 minutes", + "noDataState": "no_data", + "notifications": [] + }, + "thresholds": [ + {"value": 1, "colorMode": "critical", "op": "gt", "fill": true, "line": true} + ] + }, + { + "id": 3, + "title": "Request Latency (p50/p95/p99)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000", + "legendFormat": "p99", + "refId": "C" + } + ], + "yaxes": [ + {"format": "ms", "label": "Latency (ms)"}, + {"format": "short"} + ], + "thresholds": [ + {"value": 200, "colorMode": "warning", "op": "gt"}, + {"value": 500, "colorMode": "critical", "op": "gt"} + ] + }, + { + "id": 4, + "title": "Resource Saturation (CPU/Memory %)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "targets": [ + { + "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "CPU %", + "refId": "A" + }, + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "legendFormat": "Memory %", + "refId": "B" + } + ], + "yaxes": [ + {"format": "percent", "label": "Usage %", "max": 100}, + {"format": "short"} + ], + "thresholds": [ + {"value": 80, "colorMode": "warning", "op": "gt"}, + {"value": 90, "colorMode": "critical", "op": "gt"} + ] + }, + { + "id": 5, + "title": "Top 10 Slowest Endpoints", + "type": "table", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "targets": [ + { + "expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le, path))) * 1000", + "legendFormat": "", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "transformations": [ + {"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"path": "Endpoint", "Value": "p95 Latency (ms)"}}} + ] + }, + { + "id": 6, + "title": "SLO Status (30-day)", + "type": "stat", + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 16}, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])) * 100", + "refId": "A" + } + ], + "options": { + "graphMode": "none", + "textMode": "value_and_name", + "colorMode": "background" + }, + "fieldConfig": 
{ + "defaults": { + "unit": "percent", + "decimals": 3, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": 0, "color": "red"}, + {"value": 99.5, "color": "yellow"}, + {"value": 99.9, "color": "green"} + ] + } + } + } + }, + { + "id": 7, + "title": "Error Budget Remaining", + "type": "gauge", + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 16}, + "targets": [ + { + "expr": "(1 - ((1 - (sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])))) / (1 - 0.999))) * 100", + "refId": "A" + } + ], + "options": { + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": 0, "color": "red"}, + {"value": 25, "color": "yellow"}, + {"value": 50, "color": "green"} + ] + } + } + } + } + ] + } +} diff --git a/skills/observability-engineering/templates/prometheus-recording-rules.yaml b/skills/observability-engineering/templates/prometheus-recording-rules.yaml new file mode 100644 index 0000000..dc5f8e5 --- /dev/null +++ b/skills/observability-engineering/templates/prometheus-recording-rules.yaml @@ -0,0 +1,188 @@ +# Prometheus Recording Rules Template +# Pre-aggregated metrics for fast dashboard queries and SLO tracking +# Replace YOUR_SERVICE with actual service name + +groups: + # HTTP Request Rates + - name: http_request_rates + interval: 15s + rules: + # Total request rate (per-second) + - record: greyhaven:http_requests:rate5m + expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) + + # Request rate by service + - record: greyhaven:http_requests:rate5m:by_service + expr: sum(rate(http_requests_total[5m])) by (service) + + # Request rate by endpoint + - record: greyhaven:http_requests:rate5m:by_endpoint + expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint) + + # Request rate by method + - record: greyhaven:http_requests:rate5m:by_method + expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (method) + + # Request rate by status code + - record: greyhaven:http_requests:rate5m:by_status + expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (status) + + # HTTP Error Rates + - name: http_error_rates + interval: 15s + rules: + # Error rate (percentage) + - record: greyhaven:http_errors:rate5m + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) + + # Error rate by service + - record: greyhaven:http_errors:rate5m:by_service + expr: | + sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) + / + sum(rate(http_requests_total[5m])) by (service) + + # Error rate by endpoint + - record: greyhaven:http_errors:rate5m:by_endpoint + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) by (endpoint) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint) + + # HTTP Latency (Duration) + - name: http_latency + interval: 15s + rules: + # p50 latency (median) + - record: greyhaven:http_latency:p50 + expr: histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le)) + + # p95 latency + - record: greyhaven:http_latency:p95 + expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le)) + + # p99 latency + - record: greyhaven:http_latency:p99 + 
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le)) + + # Average latency + - record: greyhaven:http_latency:avg + expr: | + sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m])) + / + sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m])) + + # p95 latency by endpoint + - record: greyhaven:http_latency:p95:by_endpoint + expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le, endpoint)) + + # Resource Saturation + - name: resource_saturation + interval: 15s + rules: + # CPU usage percentage + - record: greyhaven:cpu_usage:percent + expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + + # Memory usage percentage + - record: greyhaven:memory_usage:percent + expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) + + # Disk usage percentage + - record: greyhaven:disk_usage:percent + expr: 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100) + + # Database connection pool saturation + - record: greyhaven:db_pool:saturation + expr: | + db_pool_connections_active{service="YOUR_SERVICE"} + / + db_pool_connections_max{service="YOUR_SERVICE"} + + # SLI Calculations (Multi-Window) + - name: sli_calculations + interval: 30s + rules: + # Availability SLI - 1 hour window + - record: greyhaven:sli:availability:1h + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h])) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h])) + + # Availability SLI - 6 hour window + - record: greyhaven:sli:availability:6h + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h])) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h])) + + # Availability SLI - 24 hour window + - record: greyhaven:sli:availability:24h + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h])) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h])) + + # Availability SLI - 30 day window + - record: greyhaven:sli:availability:30d + expr: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d])) + / + sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d])) + + # Latency SLI - 1 hour window (% requests < 200ms) + - record: greyhaven:sli:latency:1h + expr: | + sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h])) + / + sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h])) + + # Latency SLI - 30 day window + - record: greyhaven:sli:latency:30d + expr: | + sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d])) + / + sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d])) + + # Error Budget Tracking + - name: error_budget + interval: 30s + rules: + # Error budget remaining (for 99.9% SLO) + - record: greyhaven:error_budget:remaining:30d + expr: | + 1 - ( + (1 - greyhaven:sli:availability:30d{service="YOUR_SERVICE"}) + / + (1 - 0.999) + ) + + # Error budget burn rate - 1 hour window + - record: greyhaven:error_budget:burn_rate:1h + expr: | + (1 - greyhaven:sli:availability:1h{service="YOUR_SERVICE"}) + / + (1 - 0.999) + + # Error budget burn rate - 6 hour window + - record: greyhaven:error_budget:burn_rate:6h + expr: | + (1 - greyhaven:sli:availability:6h{service="YOUR_SERVICE"}) + / + (1 - 0.999) + + # Error budget burn rate - 24 hour window + - record: 
greyhaven:error_budget:burn_rate:24h + expr: | + (1 - greyhaven:sli:availability:24h{service="YOUR_SERVICE"}) + / + (1 - 0.999) + + # Error budget consumed (minutes of downtime) + - record: greyhaven:error_budget:consumed:30d + expr: | + (1 - greyhaven:sli:availability:30d{service="YOUR_SERVICE"}) * 43200 diff --git a/skills/observability-engineering/templates/slo-definition.yaml b/skills/observability-engineering/templates/slo-definition.yaml new file mode 100644 index 0000000..286c753 --- /dev/null +++ b/skills/observability-engineering/templates/slo-definition.yaml @@ -0,0 +1,173 @@ +# SLO Definition Template +# Replace YOUR_SERVICE with actual service name +# Replace 99.9 with your target SLO (99.5, 99.9, or 99.95) + +apiVersion: monitoring.greyhaven.io/v1 +kind: ServiceLevelObjective +metadata: + name: YOUR_SERVICE-slo + namespace: production +spec: + # Service identification + service: YOUR_SERVICE + environment: production + + # SLO tier (critical, essential, standard) + tier: essential + + # Time window (30 days recommended) + window: 30d + + # SLO targets + objectives: + - name: availability + target: 99.9 # 99.9% = 43.2 min downtime/month + indicator: + type: ratio + success_query: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[{{.window}}])) + total_query: | + sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}])) + + - name: latency + target: 95 # 95% of requests < 200ms + indicator: + type: ratio + success_query: | + sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[{{.window}}])) + total_query: | + sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[{{.window}}])) + + - name: error_rate + target: 99.5 # <0.5% error rate + indicator: + type: ratio + success_query: | + sum(rate(http_requests_total{service="YOUR_SERVICE",status!~"5.."}[{{.window}}])) + total_query: | + sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}])) + + # Error budget policy + errorBudget: + policy: + - budget_range: [75%, 100%] + action: "Normal feature development" + approval: "Engineering team" + + - budget_range: [50%, 75%] + action: "Monitor closely, increase testing" + approval: "Engineering team" + + - budget_range: [25%, 50%] + action: "Prioritize reliability work, reduce risky changes" + approval: "Engineering manager" + + - budget_range: [0%, 25%] + action: "Feature freeze, all hands on reliability" + approval: "VP Engineering" + requirements: + - "Daily reliability standup" + - "Postmortem for all incidents" + - "No new features until budget >50%" + + - budget_range: [0%, 0%] + action: "SLO violation - mandatory postmortem" + approval: "VP Engineering + CTO" + requirements: + - "Complete postmortem within 48 hours" + - "Action items with owners and deadlines" + - "Present to exec team" + + # Multi-window burn rate alerts + alerts: + - name: error-budget-burn-rate-critical + severity: critical + windows: + short: 1h + long: 6h + burn_rate_threshold: 14.4 # Budget exhausted in 2 hours + for: 2m + annotations: + summary: "Critical burn rate - budget exhausted in 2 hours" + description: "Service {{ $labels.service }} is burning error budget 14.4x faster than expected" + runbook: "https://runbooks.greyhaven.io/slo-burn-rate" + notifications: + - type: pagerduty + severity: critical + + - name: error-budget-burn-rate-high + severity: warning + windows: + short: 6h + long: 24h + burn_rate_threshold: 6 # Budget exhausted in 5 days + for: 15m + annotations: + summary: "High burn rate - budget exhausted in 
5 days" + description: "Service {{ $labels.service }} is burning error budget 6x faster than expected" + notifications: + - type: slack + channel: "#alerts-reliability" + + - name: error-budget-burn-rate-medium + severity: warning + windows: + short: 24h + long: 24h + burn_rate_threshold: 3 # Budget exhausted in 10 days + for: 1h + annotations: + summary: "Medium burn rate - budget exhausted in 10 days" + notifications: + - type: slack + channel: "#alerts-reliability" + + - name: error-budget-low + severity: warning + threshold: 0.25 # 25% remaining + for: 5m + annotations: + summary: "Error budget low ({{ $value | humanizePercentage }} remaining)" + description: "Consider feature freeze per error budget policy" + notifications: + - type: slack + channel: "#engineering-managers" + + - name: error-budget-depleted + severity: critical + threshold: 0 # 0% remaining + for: 5m + annotations: + summary: "Error budget depleted - feature freeze required" + description: "SLO violated. Postmortem required within 48 hours." + notifications: + - type: pagerduty + severity: critical + - type: slack + channel: "#exec-alerts" + + # Review cadence + review: + frequency: weekly + participants: + - team: engineering + - team: product + - team: sre + agenda: + - "Current error budget status" + - "Burn rate trends" + - "Recent incidents and impact" + - "Upcoming risky changes" + + # Reporting + reporting: + dashboard: + grafana_uid: YOUR_SERVICE_slo_dashboard + panels: + - slo_status + - error_budget_remaining + - burn_rate_multiwindow + - incident_timeline + export: + format: prometheus + recording_rules: true diff --git a/skills/observability-monitoring/SKILL.md b/skills/observability-monitoring/SKILL.md new file mode 100644 index 0000000..28b868b --- /dev/null +++ b/skills/observability-monitoring/SKILL.md @@ -0,0 +1,413 @@ +--- +name: observability-monitoring +description: Implement observability and monitoring using Cloudflare Workers Analytics, wrangler tail for logs, and health checks. Use when setting up monitoring, implementing logging, configuring alerts, or debugging production issues. +--- + +# Grey Haven Observability and Monitoring + +Implement comprehensive monitoring for Grey Haven applications using **Cloudflare Workers** built-in observability tools. 
+
+## Observability Stack
+
+### Grey Haven Monitoring Architecture
+
+- **Logging**: Cloudflare Workers logs + wrangler tail
+- **Metrics**: Cloudflare Workers Analytics dashboard
+- **Custom Events**: Cloudflare Analytics Engine
+- **Health Checks**: Cloudflare Health Checks for endpoint availability
+- **Error Tracking**: Console errors visible in the Cloudflare dashboard
+
+## Cloudflare Workers Logging
+
+### Console Logging in Workers
+
+```typescript
+// app/utils/logger.ts
+export interface LogEvent {
+  level: "debug" | "info" | "warn" | "error";
+  message: string;
+  context?: Record<string, unknown>;
+  userId?: string;
+  tenantId?: string;
+  requestId?: string;
+  duration?: number;
+}
+
+export function log(event: LogEvent) {
+  const logData = {
+    timestamp: new Date().toISOString(),
+    level: event.level,
+    message: event.message,
+    environment: process.env.ENVIRONMENT,
+    user_id: event.userId,
+    tenant_id: event.tenantId,
+    request_id: event.requestId,
+    duration_ms: event.duration,
+    ...event.context,
+  };
+
+  // Structured console logging (visible in the Cloudflare dashboard)
+  console[event.level](JSON.stringify(logData));
+}
+
+// Convenience methods
+export const logger = {
+  debug: (message: string, context?: Record<string, unknown>) =>
+    log({ level: "debug", message, context }),
+  info: (message: string, context?: Record<string, unknown>) =>
+    log({ level: "info", message, context }),
+  warn: (message: string, context?: Record<string, unknown>) =>
+    log({ level: "warn", message, context }),
+  error: (message: string, context?: Record<string, unknown>) =>
+    log({ level: "error", message, context }),
+};
+```
+
+### Logging Middleware
+
+```typescript
+// app/middleware/logging.ts
+import { logger } from "~/utils/logger";
+import { v4 as uuidv4 } from "uuid";
+
+export async function loggingMiddleware(
+  request: Request,
+  next: () => Promise<Response>
+) {
+  const requestId = uuidv4();
+  const startTime = Date.now();
+
+  try {
+    const response = await next();
+    const duration = Date.now() - startTime;
+
+    logger.info("Request completed", {
+      request_id: requestId,
+      method: request.method,
+      url: request.url,
+      status: response.status,
+      duration_ms: duration,
+    });
+
+    return response;
+  } catch (error) {
+    const duration = Date.now() - startTime;
+
+    logger.error("Request failed", {
+      request_id: requestId,
+      method: request.method,
+      url: request.url,
+      error: error instanceof Error ? error.message : String(error),
+      stack: error instanceof Error ? error.stack : undefined,
+      duration_ms: duration,
+    });
+
+    throw error;
+  }
+}
+```
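+
+The middleware above expects to wrap a handler that returns a `Response`. As a rough sketch of how the pieces compose — the real registration goes through TanStack Start, and `handleRequest` here is a hypothetical stand-in for the framework's router — a plain Worker `fetch` entry point could look like this:
+
+```typescript
+// Hypothetical wiring example; handleRequest is a placeholder for the real router.
+import { loggingMiddleware } from "~/middleware/logging";
+
+async function handleRequest(request: Request): Promise<Response> {
+  return new Response(JSON.stringify({ ok: true }), {
+    headers: { "Content-Type": "application/json" },
+  });
+}
+
+export default {
+  async fetch(request: Request): Promise<Response> {
+    // Every request is logged with a request ID, status, and duration.
+    return loggingMiddleware(request, () => handleRequest(request));
+  },
+};
+```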
+
+## Cloudflare Workers Analytics
+
+### Workers Analytics Dashboard
+
+Access metrics at: `https://dash.cloudflare.com → Workers → Analytics`
+
+**Key Metrics**:
+- Request rate (requests/second)
+- CPU time (milliseconds)
+- Error rate (%)
+- Success rate (%)
+- Response time (P50, P95, P99)
+- Invocations per day
+- GB-seconds (compute usage)
+
+### Wrangler Tail (Real-time Logs)
+
+```bash
+# Stream production logs
+npx wrangler tail --config wrangler.production.toml
+
+# Filter by status code
+npx wrangler tail --status error --config wrangler.production.toml
+
+# Filter by method
+npx wrangler tail --method POST --config wrangler.production.toml
+
+# Filter by IP address
+npx wrangler tail --ip 1.2.3.4 --config wrangler.production.toml
+
+# Output to file
+npx wrangler tail --config wrangler.production.toml > logs.txt
+```
+
+### Accessing Logs in Cloudflare Dashboard
+
+1. Go to `https://dash.cloudflare.com`
+2. Navigate to Workers & Pages
+3. Select your Worker
+4. Click the "Logs" tab
+5. View real-time logs with filtering
+
+**Log Features**:
+- Real-time streaming
+- Filter by status code
+- Filter by request method
+- Search log content
+- Export logs (JSON)
+
+## Analytics Engine (Custom Events)
+
+### Setup Analytics Engine
+
+**wrangler.toml**:
+```toml
+[[analytics_engine_datasets]]
+binding = "ANALYTICS"
+dataset = "my_analytics" # dataset name referenced in the queries below
+```
+
+### Track Custom Events
+
+```typescript
+// app/utils/analytics.ts
+export async function trackEvent(
+  env: Env,
+  eventName: string,
+  data: {
+    user_id?: string;
+    tenant_id?: string;
+    duration_ms?: number;
+    [key: string]: string | number | undefined;
+  }
+) {
+  try {
+    // Analytics Engine currently accepts a single index per data point,
+    // so user_id goes in the index and tenant_id travels as a blob.
+    await env.ANALYTICS.writeDataPoint({
+      blobs: [eventName, data.tenant_id || ""],
+      doubles: [data.duration_ms || 0],
+      indexes: [data.user_id || ""],
+    });
+  } catch (error) {
+    console.error("Failed to track event:", error);
+  }
+}
+
+// Usage in server function
+export const loginUser = createServerFn({ method: "POST" }).handler(
+  async ({ data, context }) => {
+    const startTime = Date.now();
+    const user = await authenticateUser(data);
+    const duration = Date.now() - startTime;
+
+    // Track login event
+    await trackEvent(context.env, "user_login", {
+      user_id: user.id,
+      tenant_id: user.tenantId,
+      duration_ms: duration,
+    });
+
+    return user;
+  }
+);
+```
+
+### Query Analytics Data
+
+Use the Cloudflare GraphQL API:
+
+```graphql
+query GetLoginStats($accountId: string!) {
+  viewer {
+    accounts(filter: { accountTag: $accountId }) {
+      workersAnalyticsEngineDataset(dataset: "my_analytics") {
+        query(
+          filter: {
+            blob1: "user_login"
+            datetime_gt: "2025-01-01T00:00:00Z"
+          }
+        ) {
+          count
+          dimensions {
+            blob1  # event name
+            blob2  # tenant_id
+            index1 # user_id
+          }
+        }
+      }
+    }
+  }
+}
+```
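+
+The GraphQL route works for dashboards, but for ad-hoc questions the Analytics Engine SQL API is often quicker. The following is a hedged sketch, not a drop-in query: `ACCOUNT_ID` and `API_TOKEN` are placeholders, `my_analytics` is the dataset name assumed above, and the `blob1`/`blob2`/`double1` columns mirror the layout `trackEvent` writes:
+
+```typescript
+// Hypothetical ad-hoc query against the Analytics Engine SQL API.
+const ACCOUNT_ID = "your-account-id"; // placeholder
+const API_TOKEN = "your-api-token"; // placeholder; needs Analytics read access
+
+const sql = `
+  SELECT blob1 AS event, blob2 AS tenant_id, count() AS events, avg(double1) AS avg_duration_ms
+  FROM my_analytics
+  WHERE timestamp > NOW() - INTERVAL '1' DAY
+  GROUP BY event, tenant_id
+  ORDER BY events DESC
+`;
+
+const response = await fetch(
+  `https://api.cloudflare.com/client/v4/accounts/${ACCOUNT_ID}/analytics_engine/sql`,
+  {
+    method: "POST",
+    headers: { Authorization: `Bearer ${API_TOKEN}` },
+    body: sql,
+  }
+);
+
+// Output format depends on the SQL FORMAT clause; log the raw body for inspection.
+console.log(await response.text());
+```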
+
+## Health Checks
+
+### Health Check Endpoint
+
+```typescript
+// app/routes/api/health.ts
+import { createServerFn } from "@tanstack/start";
+import { db } from "~/lib/server/db";
+
+export const GET = createServerFn({ method: "GET" }).handler(async ({ context }) => {
+  const startTime = Date.now();
+  const checks: Record<string, string> = {};
+
+  // Check database
+  let dbHealthy = false;
+  try {
+    await db.execute("SELECT 1");
+    dbHealthy = true;
+    checks.database = "ok";
+  } catch (error) {
+    console.error("Database health check failed:", error);
+    checks.database = "failed";
+  }
+
+  // Check Redis (if using Upstash)
+  let redisHealthy = false;
+  if (context.env.REDIS) {
+    try {
+      await context.env.REDIS.ping();
+      redisHealthy = true;
+      checks.redis = "ok";
+    } catch (error) {
+      console.error("Redis health check failed:", error);
+      checks.redis = "failed";
+    }
+  }
+
+  const duration = Date.now() - startTime;
+  const healthy = dbHealthy && (!context.env.REDIS || redisHealthy);
+
+  return new Response(
+    JSON.stringify({
+      status: healthy ? "healthy" : "unhealthy",
+      checks,
+      duration_ms: duration,
+      timestamp: new Date().toISOString(),
+      environment: process.env.ENVIRONMENT,
+    }),
+    {
+      status: healthy ? 200 : 503,
+      headers: { "Content-Type": "application/json" },
+    }
+  );
+});
+```
+
+### Cloudflare Health Checks
+
+Configure in the Cloudflare dashboard:
+
+1. Go to Traffic → Health Checks
+2. Create a health check for `/api/health`
+3. Configure:
+   - Interval: 60 seconds
+   - Timeout: 5 seconds
+   - Retries: 2
+   - Expected status: 200
+4. Set up notifications (email/webhook)
+
+## Error Tracking
+
+### Structured Error Logging
+
+```typescript
+// app/utils/error-handler.ts
+import { logger } from "~/utils/logger";
+import { trackEvent } from "~/utils/analytics";
+
+export function handleError(error: Error, context?: Record<string, unknown>) {
+  // Log error with full context
+  logger.error(error.message, {
+    error_name: error.name,
+    stack: error.stack,
+    ...context,
+  });
+
+  // Also log to Analytics Engine for tracking
+  if (context?.env) {
+    trackEvent(context.env as Env, "error_occurred", {
+      error_name: error.name,
+      error_message: error.message,
+    });
+  }
+}
+
+// Usage in server function
+export const updateUser = createServerFn({ method: "POST" }).handler(
+  async ({ data, context }) => {
+    try {
+      return await userService.update(data);
+    } catch (error) {
+      handleError(error as Error, {
+        user_id: context.user?.id,
+        tenant_id: context.tenant?.id,
+        env: context.env,
+      });
+      throw error;
+    }
+  }
+);
+```
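+
+One caveat worth noting: `handleError` fires the Analytics Engine write without awaiting it. In a Worker, work still pending when the response is returned should be handed to `ctx.waitUntil()` so the runtime keeps the invocation alive until it settles. The sketch below is illustrative only — it assumes the `Env` and `ExecutionContext` types from your Worker configuration and that the caller has the execution context in hand:
+
+```typescript
+// Hypothetical variant that defers the analytics write via waitUntil.
+import { logger } from "~/utils/logger";
+import { trackEvent } from "~/utils/analytics";
+
+export function handleErrorDeferred(
+  error: Error,
+  env: Env,
+  ctx: ExecutionContext,
+  context?: Record<string, unknown>
+) {
+  logger.error(error.message, { error_name: error.name, stack: error.stack, ...context });
+
+  // waitUntil keeps the Worker alive until the write settles,
+  // without delaying the response to the client.
+  ctx.waitUntil(
+    trackEvent(env, "error_occurred", {
+      error_name: error.name,
+      error_message: error.message,
+    })
+  );
+}
+```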
+
+### Viewing Errors in Cloudflare
+
+1. **Workers Dashboard**: View errors in real-time
+2. **Wrangler Tail**: `npx wrangler tail --status error`
+3. **Analytics**: Check error rate metrics
+4. **Health Checks**: Monitor endpoint failures
+
+## Supporting Documentation
+
+All supporting files are under 500 lines per Anthropic best practices:
+
+- **[examples/](examples/)** - Complete monitoring examples
+  - [cloudflare-logging.md](examples/cloudflare-logging.md) - Structured console logging
+  - [wrangler-tail.md](examples/wrangler-tail.md) - Real-time log streaming
+  - [analytics-engine.md](examples/analytics-engine.md) - Custom event tracking
+  - [health-checks.md](examples/health-checks.md) - Health check implementations
+  - [error-tracking.md](examples/error-tracking.md) - Error handling patterns
+  - [INDEX.md](examples/INDEX.md) - Examples navigation
+
+- **[reference/](reference/)** - Monitoring references
+  - [cloudflare-metrics.md](reference/cloudflare-metrics.md) - Available metrics
+  - [wrangler-commands.md](reference/wrangler-commands.md) - Wrangler CLI reference
+  - [alert-configuration.md](reference/alert-configuration.md) - Setting up alerts
+  - [INDEX.md](reference/INDEX.md) - Reference navigation
+
+- **[templates/](templates/)** - Copy-paste ready templates
+  - [logger.ts](templates/logger.ts) - Cloudflare logger template
+  - [health-check.ts](templates/health-check.ts) - Health check endpoint
+
+- **[checklists/](checklists/)** - Monitoring checklists
+  - [observability-setup-checklist.md](checklists/observability-setup-checklist.md) - Setup checklist
+
+## When to Apply This Skill
+
+Use this skill when:
+- Setting up monitoring for new Cloudflare Workers projects
+- Implementing structured logging with console
+- Debugging production issues with wrangler tail
+- Setting up health checks
+- Implementing custom metrics tracking with Analytics Engine
+- Configuring Cloudflare alerts
+
+## Template Reference
+
+These patterns are from Grey Haven's production monitoring:
+- **Cloudflare Workers Analytics**: Request and performance metrics
+- **Wrangler tail**: Real-time log streaming
+- **Console logging**: Structured JSON logs
+- **Analytics Engine**: Custom event tracking
+
+## Critical Reminders
+
+1. **Structured logging**: Use JSON.stringify for console logs
+2. **Request IDs**: Track requests with UUIDs for debugging
+3. **Error context**: Include tenant_id, user_id in all error logs
+4. **Health checks**: Monitor database and external service connections
+5. **Wrangler tail**: Use filters to narrow down logs (--status, --method)
+6. **Performance**: Track duration_ms for all operations
+7. **Environment**: Log environment in all messages for filtering
+8. **Analytics Engine**: Use for custom metrics and event tracking
+9. **Dashboard access**: Logs available in Cloudflare Workers dashboard
+10. **Real-time debugging**: Use wrangler tail for live production debugging
diff --git a/skills/observability-monitoring/examples/INDEX.md b/skills/observability-monitoring/examples/INDEX.md
new file mode 100644
index 0000000..5d0d1a9
--- /dev/null
+++ b/skills/observability-monitoring/examples/INDEX.md
@@ -0,0 +1,48 @@
+# Observability Examples
+
+Complete monitoring and logging examples for Grey Haven Cloudflare Workers applications.
+
+## Available Examples
+
+### [cloudflare-logging.md](cloudflare-logging.md)
+Structured console logging for Cloudflare Workers.
+- TypeScript logger implementation
+- Log levels and context
+- JSON structured logging
+- Viewing logs in Cloudflare dashboard
+
+### [wrangler-tail.md](wrangler-tail.md)
+Real-time log streaming with wrangler tail.
+- Streaming production logs
+- Filtering by status, method, IP
+- Exporting logs to files
+- Common troubleshooting patterns
+
+### [analytics-engine.md](analytics-engine.md)
+Cloudflare Analytics Engine for custom events.
+- Custom event tracking
+- Analytics Engine API
+- GraphQL queries for analytics
+- Event aggregation patterns
+
+### [health-checks.md](health-checks.md)
+Health check endpoint implementations.
+- Database health checks
+- Redis health checks
+- Multi-service health checks
+- Cloudflare health check configuration
+
+### [error-tracking.md](error-tracking.md)
+Error handling and tracking patterns.
+- Structured error logging
+- Error context tracking
+- Analytics Engine for error metrics
+- Error rate monitoring
+
+## Quick Reference
+
+**Need logging?** → [cloudflare-logging.md](cloudflare-logging.md)
+**Need real-time logs?** → [wrangler-tail.md](wrangler-tail.md)
+**Need custom metrics?** → [analytics-engine.md](analytics-engine.md)
+**Need health checks?** → [health-checks.md](health-checks.md)
+**Need error tracking?** → [error-tracking.md](error-tracking.md)
diff --git a/skills/observability-monitoring/reference/INDEX.md b/skills/observability-monitoring/reference/INDEX.md
new file mode 100644
index 0000000..3b01d5a
--- /dev/null
+++ b/skills/observability-monitoring/reference/INDEX.md
@@ -0,0 +1,32 @@
+# Observability Reference
+
+Configuration references and patterns for Cloudflare Workers monitoring.
+
+## Available References
+
+### [cloudflare-metrics.md](cloudflare-metrics.md)
+Available Cloudflare Workers metrics.
+- Workers Analytics metrics
+- Analytics Engine metrics
+- Health check metrics
+- Performance metrics (CPU time, response time)
+
+### [wrangler-commands.md](wrangler-commands.md)
+Wrangler CLI commands for monitoring.
+- wrangler tail reference
+- Log filtering options
+- Output formatting
+- Common command patterns
+
+### [alert-configuration.md](alert-configuration.md)
+Setting up alerts and notifications.
+- Cloudflare health check alerts
+- Email notifications
+- Webhook integration
+- Alert thresholds
+
+## Quick Reference
+
+**Need metrics?** → [cloudflare-metrics.md](cloudflare-metrics.md)
+**Need CLI commands?** → [wrangler-commands.md](wrangler-commands.md)
+**Need alerts?** → [alert-configuration.md](alert-configuration.md)