commit 23753b435ebb546ce8ff29f24321588dec004e0e Author: Zhongwei Li Date: Sat Nov 29 17:51:22 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..6cdaf33 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "monitoring-observability", + "description": "Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Ahmad Asmar", + "email": "zhongweili@tubi.tv" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d417535 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# monitoring-observability + +Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..e1594d1 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,869 @@ +--- +name: monitoring-observability +description: Monitoring and observability strategy, implementation, and troubleshooting. Use for designing metrics/logs/traces systems, setting up Prometheus/Grafana/Loki, creating alerts and dashboards, calculating SLOs and error budgets, analyzing performance issues, and comparing monitoring tools (Datadog, ELK, CloudWatch). Covers the Four Golden Signals, RED/USE methods, OpenTelemetry instrumentation, log aggregation patterns, and distributed tracing. +--- + +# Monitoring & Observability + +## Overview + +This skill provides comprehensive guidance for monitoring and observability workflows including metrics design, log aggregation, distributed tracing, alerting strategies, SLO/SLA management, and tool selection. + +**When to use this skill**: +- Setting up monitoring for new services +- Designing alerts and dashboards +- Troubleshooting performance issues +- Implementing SLO tracking and error budgets +- Choosing between monitoring tools +- Integrating OpenTelemetry instrumentation +- Analyzing metrics, logs, and traces +- Optimizing Datadog costs and finding waste +- Migrating from Datadog to open-source stack + +--- + +## Core Workflow: Observability Implementation + +Use this decision tree to determine your starting point: + +``` +Are you setting up monitoring from scratch? +├─ YES → Start with "1. Design Metrics Strategy" +└─ NO → Do you have an existing issue? + ├─ YES → Go to "9. Troubleshooting & Analysis" + └─ NO → Are you improving existing monitoring? + ├─ Alerts → Go to "3. Alert Design" + ├─ Dashboards → Go to "4. Dashboard & Visualization" + ├─ SLOs → Go to "5. SLO & Error Budgets" + ├─ Tool selection → Read references/tool_comparison.md + └─ Using Datadog? High costs? → Go to "7. Datadog Cost Optimization & Migration" +``` + +--- + +## 1. Design Metrics Strategy + +### Start with The Four Golden Signals + +Every service should monitor: + +1. **Latency**: Response time (p50, p95, p99) +2. **Traffic**: Requests per second +3. **Errors**: Failure rate +4. 
**Saturation**: Resource utilization + +**For request-driven services**, use the **RED Method**: +- **R**ate: Requests/sec +- **E**rrors: Error rate +- **D**uration: Response time + +**For infrastructure resources**, use the **USE Method**: +- **U**tilization: % time busy +- **S**aturation**: Queue depth +- **E**rrors**: Error count + +**Quick Start - Web Application Example**: +```promql +# Rate (requests/sec) +sum(rate(http_requests_total[5m])) + +# Errors (error rate %) +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# Duration (p95 latency) +histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) +) +``` + +### Deep Dive: Metric Design + +For comprehensive metric design guidance including: +- Metric types (counter, gauge, histogram, summary) +- Cardinality best practices +- Naming conventions +- Dashboard design principles + +**→ Read**: [references/metrics_design.md](references/metrics_design.md) + +### Automated Metric Analysis + +Detect anomalies and trends in your metrics: + +```bash +# Analyze Prometheus metrics for anomalies +python3 scripts/analyze_metrics.py prometheus \ + --endpoint http://localhost:9090 \ + --query 'rate(http_requests_total[5m])' \ + --hours 24 + +# Analyze CloudWatch metrics +python3 scripts/analyze_metrics.py cloudwatch \ + --namespace AWS/EC2 \ + --metric CPUUtilization \ + --dimensions InstanceId=i-1234567890abcdef0 \ + --hours 48 +``` + +**→ Script**: [scripts/analyze_metrics.py](scripts/analyze_metrics.py) + +--- + +## 2. Log Aggregation & Analysis + +### Structured Logging Checklist + +Every log entry should include: +- ✅ Timestamp (ISO 8601 format) +- ✅ Log level (DEBUG, INFO, WARN, ERROR, FATAL) +- ✅ Message (human-readable) +- ✅ Service name +- ✅ Request ID (for tracing) + +**Example structured log (JSON)**: +```json +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "error", + "message": "Payment processing failed", + "service": "payment-service", + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "user_id": "user123", + "order_id": "ORD-456", + "error_type": "GatewayTimeout", + "duration_ms": 5000 +} +``` + +### Log Aggregation Patterns + +**ELK Stack** (Elasticsearch, Logstash, Kibana): +- Best for: Deep log analysis, complex queries +- Cost: High (infrastructure + operations) +- Complexity: High + +**Grafana Loki**: +- Best for: Cost-effective logging, Kubernetes +- Cost: Low +- Complexity: Medium + +**CloudWatch Logs**: +- Best for: AWS-centric applications +- Cost: Medium +- Complexity: Low + +### Log Analysis + +Analyze logs for errors, patterns, and anomalies: + +```bash +# Analyze log file for patterns +python3 scripts/log_analyzer.py application.log + +# Show error lines with context +python3 scripts/log_analyzer.py application.log --show-errors + +# Extract stack traces +python3 scripts/log_analyzer.py application.log --show-traces +``` + +**→ Script**: [scripts/log_analyzer.py](scripts/log_analyzer.py) + +### Deep Dive: Logging + +For comprehensive logging guidance including: +- Structured logging implementation examples (Python, Node.js, Go, Java) +- Log aggregation patterns (ELK, Loki, CloudWatch, Fluentd) +- Query patterns and best practices +- PII redaction and security +- Sampling and rate limiting + +**→ Read**: [references/logging_guide.md](references/logging_guide.md) + +--- + +## 3. Alert Design + +### Alert Design Principles + +1. **Every alert must be actionable** - If you can't do something, don't alert +2. 
**Alert on symptoms, not causes** - Alert on user experience, not components +3. **Tie alerts to SLOs** - Connect to business impact +4. **Reduce noise** - Only page for critical issues + +### Alert Severity Levels + +| Severity | Response Time | Example | +|----------|--------------|---------| +| **Critical** | Page immediately | Service down, SLO violation | +| **Warning** | Ticket, review in hours | Elevated error rate, resource warning | +| **Info** | Log for awareness | Deployment completed, scaling event | + +### Multi-Window Burn Rate Alerting + +Alert when error budget is consumed too quickly: + +```yaml +# Fast burn (1h window) - Critical +- alert: ErrorBudgetFastBurn + expr: | + (error_rate / 0.001) > 14.4 # 99.9% SLO + for: 2m + labels: + severity: critical + +# Slow burn (6h window) - Warning +- alert: ErrorBudgetSlowBurn + expr: | + (error_rate / 0.001) > 6 # 99.9% SLO + for: 30m + labels: + severity: warning +``` + +### Alert Quality Checker + +Audit your alert rules against best practices: + +```bash +# Check single file +python3 scripts/alert_quality_checker.py alerts.yml + +# Check all rules in directory +python3 scripts/alert_quality_checker.py /path/to/prometheus/rules/ +``` + +**Checks for**: +- Alert naming conventions +- Required labels (severity, team) +- Required annotations (summary, description, runbook_url) +- PromQL expression quality +- 'for' clause to prevent flapping + +**→ Script**: [scripts/alert_quality_checker.py](scripts/alert_quality_checker.py) + +### Alert Templates + +Production-ready alert rule templates: + +**→ Templates**: +- [assets/templates/prometheus-alerts/webapp-alerts.yml](assets/templates/prometheus-alerts/webapp-alerts.yml) - Web application alerts +- [assets/templates/prometheus-alerts/kubernetes-alerts.yml](assets/templates/prometheus-alerts/kubernetes-alerts.yml) - Kubernetes alerts + +### Deep Dive: Alerting + +For comprehensive alerting guidance including: +- Alert design patterns (multi-window, rate of change, threshold with hysteresis) +- Alert annotation best practices +- Alert routing (severity-based, team-based, time-based) +- Inhibition rules +- Runbook structure +- On-call best practices + +**→ Read**: [references/alerting_best_practices.md](references/alerting_best_practices.md) + +### Runbook Template + +Create comprehensive runbooks for your alerts: + +**→ Template**: [assets/templates/runbooks/incident-runbook-template.md](assets/templates/runbooks/incident-runbook-template.md) + +--- + +## 4. Dashboard & Visualization + +### Dashboard Design Principles + +1. **Top-down layout**: Most important metrics first +2. **Color coding**: Red (critical), yellow (warning), green (healthy) +3. **Consistent time windows**: All panels use same time range +4. **Limit panels**: 8-12 panels per dashboard maximum +5. 
**Include context**: Show related metrics together + +### Recommended Dashboard Structure + +``` +┌─────────────────────────────────────┐ +│ Overall Health (Single Stats) │ +│ [Requests/s] [Error%] [P95 Latency]│ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Request Rate & Errors (Graphs) │ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Latency Distribution (Graphs) │ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Resource Usage (Graphs) │ +└─────────────────────────────────────┘ +``` + +### Generate Grafana Dashboards + +Automatically generate dashboards from templates: + +```bash +# Web application dashboard +python3 scripts/dashboard_generator.py webapp \ + --title "My API Dashboard" \ + --service my_api \ + --output dashboard.json + +# Kubernetes dashboard +python3 scripts/dashboard_generator.py kubernetes \ + --title "K8s Production" \ + --namespace production \ + --output k8s-dashboard.json + +# Database dashboard +python3 scripts/dashboard_generator.py database \ + --title "PostgreSQL" \ + --db-type postgres \ + --instance db.example.com:5432 \ + --output db-dashboard.json +``` + +**Supports**: +- Web applications (requests, errors, latency, resources) +- Kubernetes (pods, nodes, resources, network) +- Databases (PostgreSQL, MySQL) + +**→ Script**: [scripts/dashboard_generator.py](scripts/dashboard_generator.py) + +--- + +## 5. SLO & Error Budgets + +### SLO Fundamentals + +**SLI** (Service Level Indicator): Measurement of service quality +- Example: Request latency, error rate, availability + +**SLO** (Service Level Objective): Target value for an SLI +- Example: "99.9% of requests return in < 500ms" + +**Error Budget**: Allowed failure amount = (100% - SLO) +- Example: 99.9% SLO = 0.1% error budget = 43.2 minutes/month + +### Common SLO Targets + +| Availability | Downtime/Month | Use Case | +|--------------|----------------|----------| +| **99%** | 7.2 hours | Internal tools | +| **99.9%** | 43.2 minutes | Standard production | +| **99.95%** | 21.6 minutes | Critical services | +| **99.99%** | 4.3 minutes | High availability | + +### SLO Calculator + +Calculate compliance, error budgets, and burn rates: + +```bash +# Show SLO reference table +python3 scripts/slo_calculator.py --table + +# Calculate availability SLO +python3 scripts/slo_calculator.py availability \ + --slo 99.9 \ + --total-requests 1000000 \ + --failed-requests 1500 \ + --period-days 30 + +# Calculate burn rate +python3 scripts/slo_calculator.py burn-rate \ + --slo 99.9 \ + --errors 50 \ + --requests 10000 \ + --window-hours 1 +``` + +**→ Script**: [scripts/slo_calculator.py](scripts/slo_calculator.py) + +### Deep Dive: SLO/SLA + +For comprehensive SLO/SLA guidance including: +- Choosing appropriate SLIs +- Setting realistic SLO targets +- Error budget policies +- Burn rate alerting +- SLA structure and contracts +- Monthly reporting templates + +**→ Read**: [references/slo_sla_guide.md](references/slo_sla_guide.md) + +--- + +## 6. 
Distributed Tracing + +### When to Use Tracing + +Use distributed tracing when you need to: +- Debug performance issues across services +- Understand request flow through microservices +- Identify bottlenecks in distributed systems +- Find N+1 query problems + +### OpenTelemetry Implementation + +**Python example**: +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +@tracer.start_as_current_span("process_order") +def process_order(order_id): + span = trace.get_current_span() + span.set_attribute("order.id", order_id) + + try: + result = payment_service.charge(order_id) + span.set_attribute("payment.status", "success") + return result + except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +### Sampling Strategies + +- **Development**: 100% (ALWAYS_ON) +- **Staging**: 50-100% +- **Production**: 1-10% (or error-based sampling) + +**Error-based sampling** (always sample errors, 1% of successes): +```python +class ErrorSampler(Sampler): + def should_sample(self, parent_context, trace_id, name, **kwargs): + attributes = kwargs.get('attributes', {}) + + if attributes.get('error', False): + return Decision.RECORD_AND_SAMPLE + + if trace_id & 0xFF < 3: # ~1% + return Decision.RECORD_AND_SAMPLE + + return Decision.DROP +``` + +### OTel Collector Configuration + +Production-ready OpenTelemetry Collector configuration: + +**→ Template**: [assets/templates/otel-config/collector-config.yaml](assets/templates/otel-config/collector-config.yaml) + +**Features**: +- Receives OTLP, Prometheus, and host metrics +- Batching and memory limiting +- Tail sampling (error-based, latency-based, probabilistic) +- Multiple exporters (Tempo, Jaeger, Loki, Prometheus, CloudWatch, Datadog) + +### Deep Dive: Tracing + +For comprehensive tracing guidance including: +- OpenTelemetry instrumentation (Python, Node.js, Go, Java) +- Span attributes and semantic conventions +- Context propagation (W3C Trace Context) +- Backend comparison (Jaeger, Tempo, X-Ray, Datadog APM) +- Analysis patterns (finding slow traces, N+1 queries) +- Integration with logs + +**→ Read**: [references/tracing_guide.md](references/tracing_guide.md) + +--- + +## 7. Datadog Cost Optimization & Migration + +### Scenario 1: I'm Using Datadog and Costs Are Too High + +If your Datadog bill is growing out of control, start by identifying waste: + +#### Cost Analysis Script + +Automatically analyze your Datadog usage and find cost optimization opportunities: + +```bash +# Analyze Datadog usage (requires API key and APP key) +python3 scripts/datadog_cost_analyzer.py \ + --api-key $DD_API_KEY \ + --app-key $DD_APP_KEY + +# Show detailed breakdown by category +python3 scripts/datadog_cost_analyzer.py \ + --api-key $DD_API_KEY \ + --app-key $DD_APP_KEY \ + --show-details +``` + +**What it checks**: +- Infrastructure host count and cost +- Custom metrics usage and high-cardinality metrics +- Log ingestion volume and trends +- APM host usage +- Unused or noisy monitors +- Container vs VM optimization opportunities + +**→ Script**: [scripts/datadog_cost_analyzer.py](scripts/datadog_cost_analyzer.py) + +#### Common Cost Optimization Strategies + +**1. Custom Metrics Optimization** (typical savings: 20-40%): +- Remove high-cardinality tags (user IDs, request IDs) +- Delete unused custom metrics +- Aggregate metrics before sending +- Use metric prefixes to identify teams/services + +**2. 
Log Management** (typical savings: 30-50%): +- Implement log sampling for high-volume services +- Use exclusion filters for debug/trace logs in production +- Archive cold logs to S3/GCS after 7 days +- Set log retention policies (15 days instead of 30) + +**3. APM Optimization** (typical savings: 15-25%): +- Reduce trace sampling rates (10% → 5% in prod) +- Use head-based sampling instead of complete sampling +- Remove APM from non-critical services +- Use trace search with lower retention + +**4. Infrastructure Monitoring** (typical savings: 10-20%): +- Switch from VM-based to container-based pricing where possible +- Remove agents from ephemeral instances +- Use Datadog's host reduction strategies +- Consolidate staging environments + +### Scenario 2: Migrating Away from Datadog + +If you're considering migrating to a more cost-effective open-source stack: + +#### Migration Overview + +**From Datadog** → **To Open Source Stack**: +- Metrics: Datadog → **Prometheus + Grafana** +- Logs: Datadog Logs → **Grafana Loki** +- Traces: Datadog APM → **Tempo or Jaeger** +- Dashboards: Datadog → **Grafana** +- Alerts: Datadog Monitors → **Prometheus Alertmanager** + +**Estimated Cost Savings**: 60-77% ($49.8k-61.8k/year for 100-host environment) + +#### Migration Strategy + +**Phase 1: Run Parallel** (Month 1-2): +- Deploy open-source stack alongside Datadog +- Migrate metrics first (lowest risk) +- Validate data accuracy + +**Phase 2: Migrate Dashboards & Alerts** (Month 2-3): +- Convert Datadog dashboards to Grafana +- Translate alert rules (use DQL → PromQL guide below) +- Train team on new tools + +**Phase 3: Migrate Logs & Traces** (Month 3-4): +- Set up Loki for log aggregation +- Deploy Tempo/Jaeger for tracing +- Update application instrumentation + +**Phase 4: Decommission Datadog** (Month 4-5): +- Confirm all functionality migrated +- Cancel Datadog subscription + +#### Query Translation: DQL → PromQL + +When migrating dashboards and alerts, you'll need to translate Datadog queries to PromQL: + +**Quick examples**: +``` +# Average CPU +Datadog: avg:system.cpu.user{*} +Prometheus: avg(node_cpu_seconds_total{mode="user"}) + +# Request rate +Datadog: sum:requests.count{*}.as_rate() +Prometheus: sum(rate(http_requests_total[5m])) + +# P95 latency +Datadog: p95:request.duration{*} +Prometheus: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Error rate percentage +Datadog: (sum:requests.errors{*}.as_rate() / sum:requests.count{*}.as_rate()) * 100 +Prometheus: (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 +``` + +**→ Full Translation Guide**: [references/dql_promql_translation.md](references/dql_promql_translation.md) + +#### Cost Comparison + +**Example: 100-host infrastructure** + +| Component | Datadog (Annual) | Open Source (Annual) | Savings | +|-----------|-----------------|---------------------|---------| +| Infrastructure | $18,000 | $10,000 (self-hosted infra) | $8,000 | +| Custom Metrics | $600 | Included | $600 | +| Logs | $24,000 | $3,000 (storage) | $21,000 | +| APM/Traces | $37,200 | $5,000 (storage) | $32,200 | +| **Total** | **$79,800** | **$18,000** | **$61,800 (77%)** | + +### Deep Dive: Datadog Migration + +For comprehensive migration guidance including: +- Detailed cost comparison and ROI calculations +- Step-by-step migration instructions +- Infrastructure sizing recommendations (CPU, RAM, storage) +- Dashboard conversion tools and examples +- Alert rule translation patterns +- 
Application instrumentation changes (DogStatsD → Prometheus client) +- Python scripts for exporting Datadog dashboards and monitors +- Common challenges and solutions + +**→ Read**: [references/datadog_migration.md](references/datadog_migration.md) + +--- + +## 8. Tool Selection & Comparison + +### Decision Matrix + +**Choose Prometheus + Grafana if**: +- ✅ Using Kubernetes +- ✅ Want control and customization +- ✅ Have ops capacity +- ✅ Budget-conscious + +**Choose Datadog if**: +- ✅ Want ease of use +- ✅ Need full observability now +- ✅ Budget allows ($8k+/month for 100 hosts) + +**Choose Grafana Stack (LGTM) if**: +- ✅ Want open source full stack +- ✅ Cost-effective solution +- ✅ Cloud-native architecture + +**Choose ELK Stack if**: +- ✅ Heavy log analysis needs +- ✅ Need powerful search +- ✅ Have dedicated ops team + +**Choose Cloud Native (CloudWatch/etc) if**: +- ✅ Single cloud provider +- ✅ Simple needs +- ✅ Want minimal setup + +### Cost Comparison (100 hosts, 1TB logs/month) + +| Solution | Monthly Cost | Setup | Ops Burden | +|----------|-------------|--------|------------| +| Prometheus + Loki + Tempo | $1,500 | Medium | Medium | +| Grafana Cloud | $3,000 | Low | Low | +| Datadog | $8,000 | Low | None | +| ELK Stack | $4,000 | High | High | +| CloudWatch | $2,000 | Low | Low | + +### Deep Dive: Tool Comparison + +For comprehensive tool comparison including: +- Metrics platforms (Prometheus, Datadog, New Relic, CloudWatch, Grafana Cloud) +- Logging platforms (ELK, Loki, Splunk, CloudWatch Logs, Sumo Logic) +- Tracing platforms (Jaeger, Tempo, Datadog APM, X-Ray) +- Full-stack observability comparison +- Recommendations by company size + +**→ Read**: [references/tool_comparison.md](references/tool_comparison.md) + +--- + +## 9. Troubleshooting & Analysis + +### Health Check Validation + +Validate health check endpoints against best practices: + +```bash +# Check single endpoint +python3 scripts/health_check_validator.py https://api.example.com/health + +# Check multiple endpoints +python3 scripts/health_check_validator.py \ + https://api.example.com/health \ + https://api.example.com/readiness \ + --verbose +``` + +**Checks for**: +- ✓ Returns 200 status code +- ✓ Response time < 1 second +- ✓ Returns JSON format +- ✓ Contains 'status' field +- ✓ Includes version/build info +- ✓ Checks dependencies +- ✓ Disables caching + +**→ Script**: [scripts/health_check_validator.py](scripts/health_check_validator.py) + +### Common Troubleshooting Workflows + +**High Latency Investigation**: +1. Check dashboards for latency spike +2. Query traces for slow operations +3. Check database slow query log +4. Check external API response times +5. Review recent deployments +6. Check resource utilization (CPU, memory) + +**High Error Rate Investigation**: +1. Check error logs for patterns +2. Identify affected endpoints +3. Check dependency health +4. Review recent deployments +5. Check resource limits +6. Verify configuration + +**Service Down Investigation**: +1. Check if pods/instances are running +2. Check health check endpoint +3. Review recent deployments +4. Check resource availability +5. Check network connectivity +6. 
Review logs for startup errors + +--- + +## Quick Reference Commands + +### Prometheus Queries + +```promql +# Request rate +sum(rate(http_requests_total[5m])) + +# Error rate +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# P95 latency +histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) +) + +# CPU usage +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Memory usage +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 +``` + +### Kubernetes Commands + +```bash +# Check pod status +kubectl get pods -n + +# View pod logs +kubectl logs -f -n + +# Check pod resources +kubectl top pods -n + +# Describe pod for events +kubectl describe pod -n + +# Check recent deployments +kubectl rollout history deployment/ -n +``` + +### Log Queries + +**Elasticsearch**: +```json +GET /logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "level": "error" } }, + { "range": { "@timestamp": { "gte": "now-1h" } } } + ] + } + } +} +``` + +**Loki (LogQL)**: +```logql +{job="app", level="error"} |= "error" | json +``` + +**CloudWatch Insights**: +``` +fields @timestamp, level, message +| filter level = "error" +| filter @timestamp > ago(1h) +``` + +--- + +## Resources Summary + +### Scripts (automation and analysis) +- `analyze_metrics.py` - Detect anomalies in Prometheus/CloudWatch metrics +- `alert_quality_checker.py` - Audit alert rules against best practices +- `slo_calculator.py` - Calculate SLO compliance and error budgets +- `log_analyzer.py` - Parse logs for errors and patterns +- `dashboard_generator.py` - Generate Grafana dashboards from templates +- `health_check_validator.py` - Validate health check endpoints +- `datadog_cost_analyzer.py` - Analyze Datadog usage and find cost waste + +### References (deep-dive documentation) +- `metrics_design.md` - Four Golden Signals, RED/USE methods, metric types +- `alerting_best_practices.md` - Alert design, runbooks, on-call practices +- `logging_guide.md` - Structured logging, aggregation patterns +- `tracing_guide.md` - OpenTelemetry, distributed tracing +- `slo_sla_guide.md` - SLI/SLO/SLA definitions, error budgets +- `tool_comparison.md` - Comprehensive comparison of monitoring tools +- `datadog_migration.md` - Complete guide for migrating from Datadog to OSS stack +- `dql_promql_translation.md` - Datadog Query Language to PromQL translation reference + +### Templates (ready-to-use configurations) +- `prometheus-alerts/webapp-alerts.yml` - Production-ready web app alerts +- `prometheus-alerts/kubernetes-alerts.yml` - Kubernetes monitoring alerts +- `otel-config/collector-config.yaml` - OpenTelemetry Collector configuration +- `runbooks/incident-runbook-template.md` - Incident response template + +--- + +## Best Practices + +### Metrics +- Start with Four Golden Signals +- Use appropriate metric types (counter, gauge, histogram) +- Keep cardinality low (avoid high-cardinality labels) +- Follow naming conventions + +### Logging +- Use structured logging (JSON) +- Include request IDs for tracing +- Set appropriate log levels +- Redact PII before logging + +### Alerting +- Make every alert actionable +- Alert on symptoms, not causes +- Use multi-window burn rate alerts +- Include runbook links + +### Tracing +- Sample appropriately (1-10% in production) +- Always record errors +- Use semantic conventions +- Propagate context between services + +### SLOs +- Start with current performance +- Set realistic targets +- 
Define error budget policies +- Review and adjust quarterly diff --git a/assets/templates/otel-config/collector-config.yaml b/assets/templates/otel-config/collector-config.yaml new file mode 100644 index 0000000..2a94676 --- /dev/null +++ b/assets/templates/otel-config/collector-config.yaml @@ -0,0 +1,227 @@ +# OpenTelemetry Collector Configuration +# Receives metrics, logs, and traces and exports to various backends + +receivers: + # OTLP receiver (standard OpenTelemetry protocol) + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + # Prometheus receiver (scrape Prometheus endpoints) + prometheus: + config: + scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 30s + static_configs: + - targets: ['localhost:8888'] + + # Host metrics (CPU, memory, disk, network) + hostmetrics: + collection_interval: 30s + scrapers: + cpu: + memory: + disk: + network: + filesystem: + load: + + # Kubernetes receiver (cluster metrics) + k8s_cluster: + auth_type: serviceAccount + node_conditions_to_report: [Ready, MemoryPressure, DiskPressure] + distribution: kubernetes + + # Zipkin receiver (legacy tracing) + zipkin: + endpoint: 0.0.0.0:9411 + +processors: + # Batch processor (improves performance) + batch: + timeout: 10s + send_batch_size: 1024 + send_batch_max_size: 2048 + + # Memory limiter (prevent OOM) + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + # Resource processor (add resource attributes) + resource: + attributes: + - key: environment + value: production + action: insert + - key: cluster.name + value: prod-cluster + action: insert + + # Attributes processor (modify span/metric attributes) + attributes: + actions: + - key: http.url + action: delete # Remove potentially sensitive URLs + - key: db.statement + action: hash # Hash SQL queries for privacy + + # Filter processor (drop unwanted data) + filter: + metrics: + # Drop metrics matching criteria + exclude: + match_type: regexp + metric_names: + - ^go_.* # Drop Go runtime metrics + - ^process_.* # Drop process metrics + + # Tail sampling (intelligent trace sampling) + tail_sampling: + decision_wait: 10s + num_traces: 100 + policies: + # Always sample errors + - name: error-policy + type: status_code + status_code: + status_codes: [ERROR] + + # Sample slow traces + - name: latency-policy + type: latency + latency: + threshold_ms: 1000 + + # Sample 10% of others + - name: probabilistic-policy + type: probabilistic + probabilistic: + sampling_percentage: 10 + + # Span processor (modify spans) + span: + name: + to_attributes: + rules: + - ^\/api\/v1\/users\/(?P.*)$ + from_attributes: + - db.name + - http.method + +exporters: + # Prometheus exporter (expose metrics endpoint) + prometheus: + endpoint: 0.0.0.0:8889 + namespace: otel + + # OTLP exporters (send to backends) + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlp/mimir: + endpoint: mimir:4317 + tls: + insecure: true + + # Loki exporter (for logs) + loki: + endpoint: http://loki:3100/loki/api/v1/push + labels: + resource: + service.name: "service_name" + service.namespace: "service_namespace" + attributes: + level: "level" + + # Jaeger exporter (alternative tracing backend) + jaeger: + endpoint: jaeger:14250 + tls: + insecure: true + + # Elasticsearch exporter (for logs) + elasticsearch: + endpoints: + - http://elasticsearch:9200 + logs_index: otel-logs + traces_index: otel-traces + + # CloudWatch exporter (AWS) + awscloudwatch: + region: us-east-1 + namespace: MyApp + log_group_name: 
/aws/otel/logs + log_stream_name: otel-collector + + # Datadog exporter + datadog: + api: + key: ${DD_API_KEY} + site: datadoghq.com + + # File exporter (debugging) + file: + path: /tmp/otel-output.json + + # Logging exporter (console output for debugging) + logging: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + +extensions: + # Health check endpoint + health_check: + endpoint: 0.0.0.0:13133 + + # Pprof endpoint (for profiling) + pprof: + endpoint: 0.0.0.0:1777 + + # ZPages (internal diagnostics) + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [health_check, pprof, zpages] + + pipelines: + # Traces pipeline + traces: + receivers: [otlp, zipkin] + processors: [memory_limiter, batch, tail_sampling, resource, span] + exporters: [otlp/tempo, jaeger, logging] + + # Metrics pipeline + metrics: + receivers: [otlp, prometheus, hostmetrics, k8s_cluster] + processors: [memory_limiter, batch, filter, resource] + exporters: [otlp/mimir, prometheus, awscloudwatch] + + # Logs pipeline + logs: + receivers: [otlp] + processors: [memory_limiter, batch, resource, attributes] + exporters: [loki, elasticsearch, awscloudwatch] + + # Telemetry (collector's own metrics) + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888 + +# Notes: +# 1. Replace ${DD_API_KEY} with actual API key or use environment variable +# 2. Adjust endpoints to match your infrastructure +# 3. Comment out exporters you don't use +# 4. Adjust sampling rates based on your volume and needs +# 5. Add TLS configuration for production deployments diff --git a/assets/templates/prometheus-alerts/kubernetes-alerts.yml b/assets/templates/prometheus-alerts/kubernetes-alerts.yml new file mode 100644 index 0000000..adac0c6 --- /dev/null +++ b/assets/templates/prometheus-alerts/kubernetes-alerts.yml @@ -0,0 +1,293 @@ +--- +# Prometheus Alert Rules for Kubernetes +# Covers pods, nodes, deployments, and resource usage + +groups: + - name: kubernetes_pods + interval: 30s + rules: + # Pod crash looping + - alert: PodCrashLooping + expr: | + rate(kube_pod_container_status_restarts_total[15m]) > 0 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last 15 minutes. + + Check pod logs: + kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous + runbook_url: "https://runbooks.example.com/pod-crash-loop" + + # Pod not ready + - alert: PodNotReady + expr: | + sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} is in {{ $labels.phase }} state for 10 minutes. + + Investigate: + kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }} + runbook_url: "https://runbooks.example.com/pod-not-ready" + + # Pod OOMKilled + - alert: PodOOMKilled + expr: | + sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0 + for: 1m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory. 
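+
+            One way to confirm the OOM kill and inspect the container's last state:
+            kubectl describe pod {{ $labels.pod }} -n {{ $labels.namespace }}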
+ + Increase memory limits or investigate memory leak. + runbook_url: "https://runbooks.example.com/oom-killed" + + - name: kubernetes_deployments + interval: 30s + rules: + # Deployment replica mismatch + - alert: DeploymentReplicasMismatch + expr: | + kube_deployment_spec_replicas != kube_deployment_status_replicas_available + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}" + description: | + Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with + fewer replicas than desired for 15 minutes. + + Desired: {{ $value }} + Available: Check deployment status + runbook_url: "https://runbooks.example.com/replica-mismatch" + + # Deployment rollout stuck + - alert: DeploymentRolloutStuck + expr: | + kube_deployment_status_condition{condition="Progressing", status="false"} > 0 + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}" + description: | + Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck. + + Check rollout status: + kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/rollout-stuck" + + - name: kubernetes_nodes + interval: 30s + rules: + # Node not ready + - alert: NodeNotReady + expr: | + kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 5m + labels: + severity: critical + team: platform + component: kubernetes + annotations: + summary: "Node not ready - {{ $labels.node }}" + description: | + Node {{ $labels.node }} has been NotReady for 5 minutes. + + This will affect pod scheduling and availability. + + Check node status: + kubectl describe node {{ $labels.node }} + runbook_url: "https://runbooks.example.com/node-not-ready" + + # Node memory pressure + - alert: NodeMemoryPressure + expr: | + kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node under memory pressure - {{ $labels.node }}" + description: | + Node {{ $labels.node }} is experiencing memory pressure. + + Pods may be evicted. Consider scaling up or evicting low-priority pods. + runbook_url: "https://runbooks.example.com/memory-pressure" + + # Node disk pressure + - alert: NodeDiskPressure + expr: | + kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node under disk pressure - {{ $labels.node }}" + description: | + Node {{ $labels.node }} is experiencing disk pressure. + + Clean up disk space or add capacity. + runbook_url: "https://runbooks.example.com/disk-pressure" + + # Node high CPU + - alert: NodeHighCPU + expr: | + (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80 + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node high CPU usage - {{ $labels.instance }}" + description: | + Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%. + + Check for resource-intensive pods or scale cluster. 
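+
+            One way to find the heaviest consumers:
+            kubectl top pods -A --sort-by=cpu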
+ runbook_url: "https://runbooks.example.com/node-high-cpu" + + - name: kubernetes_resources + interval: 30s + rules: + # Container CPU throttling + - alert: ContainerCPUThrottling + expr: | + rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} + is being CPU throttled. + + CPU throttling rate: {{ $value | humanize }} + + Consider increasing CPU limits. + runbook_url: "https://runbooks.example.com/cpu-throttling" + + # Container memory usage high + - alert: ContainerMemoryUsageHigh + expr: | + (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.9 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} + is using {{ $value | humanizePercentage }} of its memory limit. + + Risk of OOMKill. Consider increasing memory limits. + runbook_url: "https://runbooks.example.com/high-memory" + + - name: kubernetes_pv + interval: 30s + rules: + # PersistentVolume nearing full + - alert: PersistentVolumeFillingUp + expr: | + (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}" + description: | + PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} + is {{ $value | humanizePercentage }} full. + + Available space is running low. Consider expanding volume. + runbook_url: "https://runbooks.example.com/pv-filling-up" + + # PersistentVolume critically full + - alert: PersistentVolumeCriticallyFull + expr: | + (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05 + for: 5m + labels: + severity: critical + team: platform + component: kubernetes + annotations: + summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}" + description: | + PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} + is {{ $value | humanizePercentage }} full. + + Immediate action required to prevent application failures. + runbook_url: "https://runbooks.example.com/pv-critically-full" + + - name: kubernetes_jobs + interval: 30s + rules: + # Job failed + - alert: JobFailed + expr: | + kube_job_status_failed > 0 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}" + description: | + Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed. 
+ + Check job logs: + kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/job-failed" + + # CronJob not running + - alert: CronJobNotRunning + expr: | + time() - kube_cronjob_status_last_schedule_time > 3600 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}" + description: | + CronJob {{ $labels.namespace}}/{{ $labels.cronjob }} hasn't run in over an hour. + + Check CronJob status: + kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/cronjob-not-running" diff --git a/assets/templates/prometheus-alerts/webapp-alerts.yml b/assets/templates/prometheus-alerts/webapp-alerts.yml new file mode 100644 index 0000000..f7e596e --- /dev/null +++ b/assets/templates/prometheus-alerts/webapp-alerts.yml @@ -0,0 +1,243 @@ +--- +# Prometheus Alert Rules for Web Applications +# Based on SLO best practices and multi-window burn rate alerting + +groups: + - name: webapp_availability + interval: 30s + rules: + # Fast burn rate alert (1h window) - SLO: 99.9% + - alert: ErrorBudgetFastBurn + expr: | + ( + sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h])) + / + sum(rate(http_requests_total{job="webapp"}[1h])) + ) > (14.4 * 0.001) + for: 2m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Fast error budget burn - {{ $labels.job }}" + description: | + Error rate is {{ $value | humanizePercentage }} over the last hour, + burning through error budget at 14.4x rate. + + At this rate, the monthly error budget will be exhausted in 2 days. + + Immediate investigation required. + runbook_url: "https://runbooks.example.com/error-budget-burn" + dashboard: "https://grafana.example.com/d/webapp" + + # Slow burn rate alert (6h window) + - alert: ErrorBudgetSlowBurn + expr: | + ( + sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h])) + / + sum(rate(http_requests_total{job="webapp"}[6h])) + ) > (6 * 0.001) + for: 30m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Elevated error budget burn - {{ $labels.job }}" + description: | + Error rate is {{ $value | humanizePercentage }} over the last 6 hours, + burning through error budget at 6x rate. + + Monitor closely and investigate if trend continues. + runbook_url: "https://runbooks.example.com/error-budget-burn" + + # Service down alert + - alert: WebAppDown + expr: up{job="webapp"} == 0 + for: 2m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Web application is down - {{ $labels.instance }}" + description: | + Web application instance {{ $labels.instance }} has been down for 2 minutes. + + Check service health and logs immediately. + runbook_url: "https://runbooks.example.com/service-down" + + - name: webapp_latency + interval: 30s + rules: + # High latency (p95) + - alert: HighLatencyP95 + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le) + ) > 0.5 + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High p95 latency - {{ $labels.job }}" + description: | + P95 request latency is {{ $value }}s, exceeding 500ms threshold. + + This may impact user experience. 
Check for: + - Slow database queries + - External API issues + - Resource saturation + runbook_url: "https://runbooks.example.com/high-latency" + dashboard: "https://grafana.example.com/d/webapp-latency" + + # Very high latency (p99) + - alert: HighLatencyP99 + expr: | + histogram_quantile(0.99, + sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le) + ) > 2 + for: 5m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Critical latency degradation - {{ $labels.job }}" + description: | + P99 request latency is {{ $value }}s, exceeding 2s threshold. + + Severe performance degradation detected. + runbook_url: "https://runbooks.example.com/high-latency" + + - name: webapp_resources + interval: 30s + rules: + # High CPU + - alert: HighCPU + expr: | + rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80 + for: 15m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High CPU usage - {{ $labels.instance }}" + description: | + CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}. + + Consider scaling up or investigating CPU-intensive operations. + runbook_url: "https://runbooks.example.com/high-cpu" + + # High memory + - alert: HighMemory + expr: | + (process_resident_memory_bytes{job="webapp"} / node_memory_MemTotal_bytes) * 100 > 80 + for: 15m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High memory usage - {{ $labels.instance }}" + description: | + Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}. + + Check for memory leaks or consider scaling up. + runbook_url: "https://runbooks.example.com/high-memory" + + - name: webapp_traffic + interval: 30s + rules: + # Traffic spike + - alert: TrafficSpike + expr: | + sum(rate(http_requests_total{job="webapp"}[5m])) + > + 1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h)) + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Traffic spike detected - {{ $labels.job }}" + description: | + Request rate increased by 50% compared to 1 hour ago. + + Current: {{ $value | humanize }} req/s + + This could be: + - Legitimate traffic increase + - DDoS attack + - Retry storm + + Monitor closely and be ready to scale. + runbook_url: "https://runbooks.example.com/traffic-spike" + + # Traffic drop (potential issue) + - alert: TrafficDrop + expr: | + sum(rate(http_requests_total{job="webapp"}[5m])) + < + 0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h)) + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Traffic drop detected - {{ $labels.job }}" + description: | + Request rate dropped by 50% compared to 1 hour ago. + + This could indicate: + - Upstream service issue + - DNS problems + - Load balancer misconfiguration + runbook_url: "https://runbooks.example.com/traffic-drop" + + - name: webapp_dependencies + interval: 30s + rules: + # Database connection pool exhaustion + - alert: DatabasePoolExhausted + expr: | + (db_connection_pool_active / db_connection_pool_max) > 0.9 + for: 5m + labels: + severity: critical + team: backend + component: database + annotations: + summary: "Database connection pool near exhaustion" + description: | + Connection pool is {{ $value | humanizePercentage }} full. + + This will cause request failures. Immediate action required. 
+ runbook_url: "https://runbooks.example.com/db-pool-exhausted" + + # External API errors + - alert: ExternalAPIErrors + expr: | + sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api) + / + sum(rate(external_api_requests_total[5m])) by (api) + > 0.1 + for: 5m + labels: + severity: warning + team: backend + component: integration + annotations: + summary: "High error rate from external API - {{ $labels.api }}" + description: | + {{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate. + + Check API status page and consider enabling circuit breaker. + runbook_url: "https://runbooks.example.com/external-api-errors" diff --git a/assets/templates/runbooks/incident-runbook-template.md b/assets/templates/runbooks/incident-runbook-template.md new file mode 100644 index 0000000..59a0103 --- /dev/null +++ b/assets/templates/runbooks/incident-runbook-template.md @@ -0,0 +1,409 @@ +# Runbook: [Alert Name] + +## Overview + +**Alert Name**: [e.g., HighLatency, ServiceDown, ErrorBudgetBurn] + +**Severity**: [Critical | Warning | Info] + +**Team**: [e.g., Backend, Platform, Database] + +**Component**: [e.g., API Gateway, User Service, PostgreSQL] + +**What it means**: [One-line description of what this alert indicates] + +**User impact**: [How does this affect users? High/Medium/Low] + +**Urgency**: [How quickly must this be addressed? Immediate/Hours/Days] + +--- + +## Alert Details + +### When This Alert Fires + +This alert fires when: +- [Specific condition, e.g., "P95 latency exceeds 500ms for 10 minutes"] +- [Any additional conditions] + +### Symptoms + +Users will experience: +- [ ] Slow response times +- [ ] Errors or failures +- [ ] Service unavailable +- [ ] [Other symptoms] + +### Probable Causes + +Common causes include: +1. **[Cause 1]**: [Description] + - Example: Database overload due to slow queries +2. **[Cause 2]**: [Description] + - Example: Memory leak causing OOM errors +3. **[Cause 3]**: [Description] + - Example: Upstream service degradation + +--- + +## Investigation Steps + +### 1. Check Service Health + +**Dashboard**: [Link to primary dashboard] + +**Key metrics to check**: +```bash +# Request rate +sum(rate(http_requests_total[5m])) + +# Error rate +sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) + +# Latency (p95, p99) +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +**What to look for**: +- [ ] Has traffic spiked recently? +- [ ] Is error rate elevated? +- [ ] Are any endpoints particularly slow? + +### 2. Check Recent Changes + +**Deployments**: +```bash +# Kubernetes +kubectl rollout history deployment/[service-name] -n [namespace] + +# Check when last deployed +kubectl get pods -n [namespace] -o wide | grep [service-name] +``` + +**What to look for**: +- [ ] Was there a recent deployment? +- [ ] Did alert start after deployment? +- [ ] Any configuration changes? + +### 3. 
Check Logs + +**Log query** (adjust for your log system): +```bash +# Kubernetes +kubectl logs deployment/[service-name] -n [namespace] --tail=100 | grep ERROR + +# Elasticsearch/Kibana +GET /logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "service": "[service-name]" } }, + { "match": { "level": "error" } }, + { "range": { "@timestamp": { "gte": "now-30m" } } } + ] + } + } +} + +# Loki/LogQL +{job="[service-name]"} |= "error" | json | level="error" +``` + +**What to look for**: +- [ ] Repeated error messages +- [ ] Stack traces +- [ ] Connection errors +- [ ] Timeout errors + +### 4. Check Dependencies + +**Database**: +```bash +# Check active connections +SELECT count(*) FROM pg_stat_activity WHERE state = 'active'; + +# Check slow queries +SELECT pid, now() - pg_stat_activity.query_start AS duration, query +FROM pg_stat_activity +WHERE state = 'active' AND now() - pg_stat_activity.query_start > interval '5 seconds'; +``` + +**External APIs**: +- [ ] Check status pages: [Link to status pages] +- [ ] Check API error rates in dashboard +- [ ] Test API endpoints manually + +**Cache** (Redis/Memcached): +```bash +# Redis info +redis-cli -h [host] INFO stats + +# Check memory usage +redis-cli -h [host] INFO memory +``` + +### 5. Check Resource Usage + +**CPU and Memory**: +```bash +# Kubernetes +kubectl top pods -n [namespace] | grep [service-name] + +# Node metrics +kubectl top nodes +``` + +**Prometheus queries**: +```promql +# CPU usage by pod +sum(rate(container_cpu_usage_seconds_total{pod=~"[service-name].*"}[5m])) by (pod) + +# Memory usage by pod +sum(container_memory_usage_bytes{pod=~"[service-name].*"}) by (pod) +``` + +**What to look for**: +- [ ] CPU throttling +- [ ] Memory approaching limits +- [ ] Disk space issues + +### 6. Check Traces (if available) + +**Trace query**: +```bash +# Jaeger +# Search for slow traces (> 1s) in last 30 minutes + +# Tempo/TraceQL +{ duration > 1s && resource.service.name = "[service-name]" } +``` + +**What to look for**: +- [ ] Which operation is slow? +- [ ] Where is time spent? (DB, external API, service logic) +- [ ] Any N+1 query patterns? 
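+
+**Example: spotting N+1 patterns** (a minimal sketch, assuming Grafana Tempo's TraceQL and the OpenTelemetry `db.system` span attribute; adjust names to your instrumentation):
+
+```bash
+# Tempo/TraceQL: traces where a single request contains an unusually
+# high number of database spans, a common N+1 signature
+{ resource.service.name = "[service-name]" && span.db.system = "postgresql" } | count() > 20
+```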
+ +--- + +## Common Scenarios and Solutions + +### Scenario 1: Recent Deployment Caused Issue + +**Symptoms**: +- Alert started immediately after deployment +- Error logs correlate with new code + +**Solution**: +```bash +# Rollback deployment +kubectl rollout undo deployment/[service-name] -n [namespace] + +# Verify rollback succeeded +kubectl rollout status deployment/[service-name] -n [namespace] + +# Monitor for alert resolution +``` + +**Follow-up**: +- [ ] Create incident report +- [ ] Review deployment process +- [ ] Add pre-deployment checks + +### Scenario 2: Database Performance Issue + +**Symptoms**: +- Slow query logs show problematic queries +- Database CPU or connection pool exhausted + +**Solution**: +```bash +# Identify slow query +# Kill long-running query (use with caution) +SELECT pg_cancel_backend([pid]); + +# Or terminate if cancel doesn't work +SELECT pg_terminate_backend([pid]); + +# Add index if missing (in maintenance window) +CREATE INDEX CONCURRENTLY idx_name ON table_name (column_name); +``` + +**Follow-up**: +- [ ] Add query performance test +- [ ] Review and optimize query +- [ ] Consider read replicas + +### Scenario 3: Memory Leak + +**Symptoms**: +- Memory usage gradually increasing +- Eventually OOMKilled +- Restarts temporarily fix issue + +**Solution**: +```bash +# Immediate: Restart pods +kubectl rollout restart deployment/[service-name] -n [namespace] + +# Increase memory limits (temporary) +kubectl set resources deployment/[service-name] -n [namespace] \ + --limits=memory=2Gi +``` + +**Follow-up**: +- [ ] Profile application for memory leaks +- [ ] Add memory usage alerts +- [ ] Fix root cause + +### Scenario 4: Traffic Spike / DDoS + +**Symptoms**: +- Sudden traffic increase +- Traffic from unusual sources +- High CPU/memory across all instances + +**Solution**: +```bash +# Scale up immediately +kubectl scale deployment/[service-name] -n [namespace] --replicas=10 + +# Enable rate limiting at load balancer level +# (Specific steps depend on LB) + +# Block suspicious IPs if confirmed DDoS +# (Use WAF or network policies) +``` + +**Follow-up**: +- [ ] Implement rate limiting +- [ ] Add DDoS protection (CloudFlare, WAF) +- [ ] Set up auto-scaling + +### Scenario 5: Upstream Service Degradation + +**Symptoms**: +- Errors calling external API +- Timeouts to upstream service +- Upstream status page shows issues + +**Solution**: +```bash +# Enable circuit breaker (if available) +# Adjust timeout configuration +# Switch to backup service/cached data + +# Monitor external service +# Check status page: [Link] +``` + +**Follow-up**: +- [ ] Implement circuit breaker pattern +- [ ] Add fallback mechanisms +- [ ] Set up external service monitoring + +--- + +## Immediate Actions (< 5 minutes) + +These should be done first to mitigate impact: + +1. **[Action 1]**: [e.g., "Scale up service"] + ```bash + kubectl scale deployment/[service] --replicas=10 + ``` + +2. **[Action 2]**: [e.g., "Rollback deployment"] + ```bash + kubectl rollout undo deployment/[service] + ``` + +3. **[Action 3]**: [e.g., "Enable circuit breaker"] + +--- + +## Short-term Actions (< 30 minutes) + +After immediate mitigation: + +1. **[Action 1]**: [e.g., "Investigate root cause"] +2. **[Action 2]**: [e.g., "Optimize slow query"] +3. **[Action 3]**: [e.g., "Clear cache if stale"] + +--- + +## Long-term Actions (Post-Incident) + +Preventive measures: + +1. **[Action 1]**: [e.g., "Add circuit breaker"] +2. **[Action 2]**: [e.g., "Implement auto-scaling"] +3. 
**[Action 3]**: [e.g., "Add query performance tests"] +4. **[Action 4]**: [e.g., "Update alert thresholds"] + +--- + +## Escalation + +If issue persists after 30 minutes: + +**Escalation Path**: +1. **Primary oncall**: @[username] ([slack/email]) +2. **Team lead**: @[username] ([slack/email]) +3. **Engineering manager**: @[username] ([slack/email]) +4. **Incident commander**: @[username] ([slack/email]) + +**Communication**: +- **Slack channel**: #[incidents-channel] +- **Status page**: [Link] +- **Incident tracking**: [Link to incident management tool] + +--- + +## Related Runbooks + +- [Related Runbook 1] +- [Related Runbook 2] +- [Related Runbook 3] + +## Related Dashboards + +- [Main Service Dashboard] +- [Resource Usage Dashboard] +- [Dependency Dashboard] + +## Related Documentation + +- [Architecture Diagram] +- [Service Documentation] +- [API Documentation] + +--- + +## Recent Incidents + +| Date | Duration | Root Cause | Resolution | Ticket | +|------|----------|------------|------------|--------| +| 2024-10-15 | 23 min | Database pool exhausted | Increased pool size | INC-123 | +| 2024-09-30 | 45 min | Memory leak | Fixed code, restarted | INC-120 | + +--- + +## Runbook Metadata + +**Last Updated**: [Date] + +**Owner**: [Team name] + +**Reviewers**: [Names] + +**Next Review**: [Date] + +--- + +## Notes + +- This runbook should be reviewed quarterly +- Update after each incident to capture new learnings +- Keep investigation steps concise and actionable +- Include actual commands that can be copy-pasted diff --git a/monitoring-observability.skill b/monitoring-observability.skill new file mode 100644 index 0000000..1a0c75a Binary files /dev/null and b/monitoring-observability.skill differ diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..64b1c64 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,125 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:ahmedasmar/devops-claude-skills:monitoring-observability", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "9bb89b1ce889c2df6d7c3c2eedbd6d1301297561", + "treeHash": "9fd50a78a79b6d45553e3372bc2d5142f4c48ba4a945ca724356f89f9ce08825", + "generatedAt": "2025-11-28T10:13:03.403599Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "monitoring-observability", + "description": "Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison", + "version": null + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "b18b6358cf31ab285b751916a5b2c670b5bc2c8748ef17216f2c9106e4997f8e" + }, + { + "path": "SKILL.md", + "sha256": "c02fcac42ed2d4d6fcda67a9f835000b1a1198734e4d8d18000546dda81402e4" + }, + { + "path": "monitoring-observability.skill", + "sha256": "c2c368577bb73885c887cc824b695fb3d36f4a77e74b2e25dcd7815c331a71c1" + }, + { + "path": "references/alerting_best_practices.md", + "sha256": "99cea7a40310b77a4fdff5543a0b1ee44189497508757bee0dc9ebbe11794a53" + }, + { + "path": "references/metrics_design.md", + "sha256": "6edc73473e9d3c2ac7e46a4d97576d356d177ed701a2468c5e21d528ff9c29d7" + }, + { + "path": "references/tracing_guide.md", + "sha256": "5e419d77a31d8b3ee5c16fb57e1fc6e3e16d31efb8f4a86dd756c7327a482fa0" + 
}, + { + "path": "references/dql_promql_translation.md", + "sha256": "47113e77b03d9ac70fc35121efd93cf5e17e031b878d27791403493b71058c5c" + }, + { + "path": "references/tool_comparison.md", + "sha256": "fd0fc7e4fc3641ca0ddc469a14fa1373457f5a4586fe4bc7ec23afe3de9f6171" + }, + { + "path": "references/datadog_migration.md", + "sha256": "9ed5e276eb2ea67f72c91e1bb53374b293e164fa28c4c44f31ee9f8660dfaf02" + }, + { + "path": "references/logging_guide.md", + "sha256": "2c94b61d6db2c0f6b8927c8092010f3a2f1ea20d2eefd330d8073e7b4bcf4c9d" + }, + { + "path": "references/slo_sla_guide.md", + "sha256": "2a0cb69dd120897183f7bcab002a368dbe11bd5038817906da3391ca168e0052" + }, + { + "path": "scripts/log_analyzer.py", + "sha256": "c7fb7e13c2d6507c81ee9575fc8514408d36b2f2e786caeb536ba927d517046e" + }, + { + "path": "scripts/analyze_metrics.py", + "sha256": "50ad856cb043dfd70b60c6ca685b526d34b8bc5e5454dd0b530033da3da22545" + }, + { + "path": "scripts/health_check_validator.py", + "sha256": "cef8c447fabf83dfd9bd28a8d22127b87b66aafa4d151cbccd9fe1f1db0bbcf2" + }, + { + "path": "scripts/alert_quality_checker.py", + "sha256": "b561cf9c41e2de8d5f09557c018110553047d0ad54629bdc7a07a654d76263d1" + }, + { + "path": "scripts/datadog_cost_analyzer.py", + "sha256": "05a1c6c0033b04f2f5206af015907f2df4c9cf57f4c2b8f10ba2565236a5c97f" + }, + { + "path": "scripts/slo_calculator.py", + "sha256": "c26ab0f0a31e5efa830a9f24938ec356bfaef927438bd47b95f4ad0015cff662" + }, + { + "path": "scripts/dashboard_generator.py", + "sha256": "6fe98a49ae431d67bc44eb631c542ba29199da72cc348e90ec99d73a05783ee5" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "7b6a16e6bce66bf87929c2f3c4ea32f4bfadd8d9606edd195f144c82ec85f151" + }, + { + "path": "assets/templates/prometheus-alerts/webapp-alerts.yml", + "sha256": "d881081e53650c335ec5cc7d5d96bade03e607e55bff3bcbafe6811377055154" + }, + { + "path": "assets/templates/prometheus-alerts/kubernetes-alerts.yml", + "sha256": "cb8c247b245ea1fb2a904f525fce8f74f9237d79eda04c2c60938135a7271415" + }, + { + "path": "assets/templates/runbooks/incident-runbook-template.md", + "sha256": "1a5ba8951cf5b1408ea2101232ffe8d88fab75ed4ae63b0c9f1902059373112d" + }, + { + "path": "assets/templates/otel-config/collector-config.yaml", + "sha256": "2696548b1c7f4034283cc2387f9730efa4811881d1c9c9219002e7affc8c29f2" + } + ], + "dirSha256": "9fd50a78a79b6d45553e3372bc2d5142f4c48ba4a945ca724356f89f9ce08825" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/alerting_best_practices.md b/references/alerting_best_practices.md new file mode 100644 index 0000000..e4ce496 --- /dev/null +++ b/references/alerting_best_practices.md @@ -0,0 +1,609 @@ +# Alerting Best Practices + +## Core Principles + +### 1. Every Alert Should Be Actionable +If you can't do something about it, don't alert on it. + +❌ Bad: `Alert: CPU > 50%` (What action should be taken?) +✅ Good: `Alert: API latency p95 > 2s for 10m` (Investigate/scale up) + +### 2. Alert on Symptoms, Not Causes +Alert on what users experience, not underlying components. + +❌ Bad: `Database connection pool 80% full` +✅ Good: `Request latency p95 > 1s` (which might be caused by DB pool) + +### 3. Alert on SLO Violations +Tie alerts to Service Level Objectives. + +✅ `Error rate exceeds 0.1% (SLO: 99.9% availability)` + +### 4. Reduce Noise +Alert fatigue is real. Only page for critical issues. 
+ +**Alert Severity Levels**: +- **Critical**: Page on-call immediately (user-facing issue) +- **Warning**: Create ticket, review during business hours +- **Info**: Log for awareness, no action needed + +--- + +## Alert Design Patterns + +### Pattern 1: Multi-Window Multi-Burn-Rate + +Google's recommended SLO alerting approach. + +**Concept**: Alert when error budget burn rate is high enough to exhaust the budget too quickly. + +```yaml +# Fast burn (6% of budget in 1 hour) +- alert: FastBurnRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[1h])) + / + sum(rate(http_requests_total[1h])) + > (14.4 * 0.001) # 14.4x burn rate for 99.9% SLO + for: 2m + labels: + severity: critical + +# Slow burn (6% of budget in 6 hours) +- alert: SlowBurnRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[6h])) + / + sum(rate(http_requests_total[6h])) + > (6 * 0.001) # 6x burn rate for 99.9% SLO + for: 30m + labels: + severity: warning +``` + +**Burn Rate Multipliers for 99.9% SLO (0.1% error budget)**: +- 1 hour window, 2m grace: 14.4x burn rate +- 6 hour window, 30m grace: 6x burn rate +- 3 day window, 6h grace: 1x burn rate + +### Pattern 2: Rate of Change +Alert when metrics change rapidly. + +```yaml +- alert: TrafficSpike + expr: | + sum(rate(http_requests_total[5m])) + > + 1.5 * sum(rate(http_requests_total[5m] offset 1h)) + for: 10m + annotations: + summary: "Traffic increased by 50% compared to 1 hour ago" +``` + +### Pattern 3: Threshold with Hysteresis +Prevent flapping with different thresholds for firing and resolving. + +```yaml +# Fire at 90%, resolve at 70% +- alert: HighCPU + expr: cpu_usage > 90 + for: 5m + +- alert: HighCPU_Resolved + expr: cpu_usage < 70 + for: 5m +``` + +### Pattern 4: Absent Metrics +Alert when expected metrics stop being reported (service down). + +```yaml +- alert: ServiceDown + expr: absent(up{job="my-service"}) + for: 5m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is not reporting metrics" +``` + +### Pattern 5: Aggregate Alerts +Alert on aggregate performance across multiple instances. + +```yaml +- alert: HighOverallErrorRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[5m])) + / + sum(rate(http_requests_total[5m])) + > 0.05 + for: 10m + annotations: + summary: "Overall error rate is {{ $value | humanizePercentage }}" +``` + +--- + +## Alert Annotation Best Practices + +### Required Fields + +**summary**: One-line description of the issue +```yaml +summary: "High error rate on {{ $labels.service }}: {{ $value | humanizePercentage }}" +``` + +**description**: Detailed explanation with context +```yaml +description: | + Error rate on {{ $labels.service }} is {{ $value | humanizePercentage }}, + which exceeds the threshold of 1% for more than 10 minutes. 
+ + Current value: {{ $value }} + Runbook: https://runbooks.example.com/high-error-rate +``` + +**runbook_url**: Link to investigation steps +```yaml +runbook_url: "https://runbooks.example.com/alerts/{{ $labels.alertname }}" +``` + +### Optional but Recommended + +**dashboard**: Link to relevant dashboard +```yaml +dashboard: "https://grafana.example.com/d/service-dashboard?var-service={{ $labels.service }}" +``` + +**logs**: Link to logs +```yaml +logs: "https://kibana.example.com/app/discover#/?_a=(query:(query_string:(query:'service:{{ $labels.service }}')))" +``` + +--- + +## Alert Label Best Practices + +### Required Labels + +**severity**: Critical, warning, or info +```yaml +labels: + severity: critical +``` + +**team**: Who should handle this alert +```yaml +labels: + team: platform + severity: critical +``` + +**component**: What part of the system +```yaml +labels: + component: api-gateway + severity: warning +``` + +### Example Complete Alert +```yaml +- alert: HighLatency + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service) + ) > 1 + for: 10m + labels: + severity: warning + team: backend + component: api + environment: "{{ $labels.environment }}" + annotations: + summary: "High latency on {{ $labels.service }}" + description: | + P95 latency on {{ $labels.service }} is {{ $value }}s, exceeding 1s threshold. + + This may impact user experience. Check recent deployments and database performance. + + Current p95: {{ $value }}s + Threshold: 1s + Duration: 10m+ + runbook_url: "https://runbooks.example.com/high-latency" + dashboard: "https://grafana.example.com/d/api-dashboard" + logs: "https://kibana.example.com/app/discover#/?_a=(query:(query_string:(query:'service:{{ $labels.service }} AND level:error')))" +``` + +--- + +## Alert Thresholds + +### General Guidelines + +**Response Time / Latency**: +- Warning: p95 > 500ms or p99 > 1s +- Critical: p95 > 2s or p99 > 5s + +**Error Rate**: +- Warning: > 1% +- Critical: > 5% + +**Availability**: +- Warning: < 99.9% +- Critical: < 99.5% + +**CPU Utilization**: +- Warning: > 70% for 15m +- Critical: > 90% for 5m + +**Memory Utilization**: +- Warning: > 80% for 15m +- Critical: > 95% for 5m + +**Disk Space**: +- Warning: > 80% full +- Critical: > 90% full + +**Queue Depth**: +- Warning: > 70% of max capacity +- Critical: > 90% of max capacity + +### Application-Specific Thresholds + +Set thresholds based on: +1. **Historical performance**: Use p95 of last 30 days + 20% +2. **SLO requirements**: If SLO is 99.9%, alert at 99.5% +3. **Business impact**: What error rate causes user complaints? + +--- + +## The "for" Clause + +Prevent alert flapping by requiring the condition to be true for a duration. 
+ +### Guidelines + +**Critical alerts**: Short duration (2-5m) +```yaml +- alert: ServiceDown + expr: up == 0 + for: 2m # Quick detection for critical issues +``` + +**Warning alerts**: Longer duration (10-30m) +```yaml +- alert: HighMemoryUsage + expr: memory_usage > 80 + for: 15m # Avoid noise from temporary spikes +``` + +**Resource saturation**: Medium duration (5-10m) +```yaml +- alert: HighCPU + expr: cpu_usage > 90 + for: 5m +``` + +--- + +## Alert Routing + +### Severity-Based Routing + +```yaml +# alertmanager.yml +route: + group_by: ['alertname', 'cluster'] + group_wait: 10s + group_interval: 5m + repeat_interval: 4h + receiver: 'default' + + routes: + # Critical alerts → PagerDuty + - match: + severity: critical + receiver: pagerduty + group_wait: 10s + repeat_interval: 5m + + # Warning alerts → Slack + - match: + severity: warning + receiver: slack + group_wait: 30s + repeat_interval: 12h + + # Info alerts → Email + - match: + severity: info + receiver: email + repeat_interval: 24h +``` + +### Team-Based Routing + +```yaml +routes: + # Platform team + - match: + team: platform + receiver: platform-pagerduty + + # Backend team + - match: + team: backend + receiver: backend-slack + + # Database team + - match: + component: database + receiver: dba-pagerduty +``` + +### Time-Based Routing + +```yaml +# Only page during business hours for non-critical +routes: + - match: + severity: warning + receiver: slack + active_time_intervals: + - business_hours + +time_intervals: + - name: business_hours + time_intervals: + - weekdays: ['monday:friday'] + times: + - start_time: '09:00' + end_time: '17:00' + location: 'America/New_York' +``` + +--- + +## Alert Grouping + +### Intelligent Grouping + +**Group by service and environment**: +```yaml +route: + group_by: ['alertname', 'service', 'environment'] + group_wait: 30s + group_interval: 5m +``` + +This prevents: +- 50 alerts for "HighCPU" on different pods → 1 grouped alert +- Mixing production and staging alerts + +### Inhibition Rules + +Suppress related alerts when a parent alert fires. + +```yaml +inhibit_rules: + # If service is down, suppress latency alerts + - source_match: + alertname: ServiceDown + target_match: + alertname: HighLatency + equal: ['service'] + + # If node is down, suppress all pod alerts on that node + - source_match: + alertname: NodeDown + target_match_re: + alertname: '(PodCrashLoop|HighCPU|HighMemory)' + equal: ['node'] +``` + +--- + +## Runbook Structure + +Every alert should link to a runbook with: + +### 1. Context +- What does this alert mean? +- What is the user impact? +- What is the urgency? + +### 2. Investigation Steps +```markdown +## Investigation + +1. Check service health dashboard + https://grafana.example.com/d/service-dashboard + +2. Check recent deployments + kubectl rollout history deployment/myapp -n production + +3. Check error logs + kubectl logs deployment/myapp -n production --tail=100 | grep ERROR + +4. Check dependencies + - Database: Check slow query log + - Redis: Check memory usage + - External APIs: Check status pages +``` + +### 3. Common Causes +```markdown +## Common Causes + +- **Recent deployment**: Check if alert started after deployment +- **Traffic spike**: Check request rate, might need to scale +- **Database issues**: Check query performance and connection pool +- **External API degradation**: Check third-party status pages +``` + +### 4. Resolution Steps +```markdown +## Resolution + +### Immediate Actions (< 5 minutes) +1. 
Scale up if traffic spike: `kubectl scale deployment myapp --replicas=10` +2. Rollback if recent deployment: `kubectl rollout undo deployment/myapp` + +### Short-term Actions (< 30 minutes) +1. Restart pods if memory leak: `kubectl rollout restart deployment/myapp` +2. Clear cache if stale data: `redis-cli -h cache.example.com FLUSHDB` + +### Long-term Actions (post-incident) +1. Review and optimize slow queries +2. Implement circuit breakers +3. Add more capacity +4. Update alert thresholds if false positive +``` + +### 5. Escalation +```markdown +## Escalation + +If issue persists after 30 minutes: +- Slack: #backend-oncall +- PagerDuty: Escalate to senior engineer +- Incident Commander: Jane Doe (jane@example.com) +``` + +--- + +## Anti-Patterns to Avoid + +### 1. Alert on Everything +❌ Don't: Alert on every warning log +✅ Do: Alert on error rate exceeding threshold + +### 2. Alert Without Context +❌ Don't: "Error rate high" +✅ Do: "Error rate 5.2% exceeds 1% threshold for 10m, impacting checkout flow" + +### 3. Static Thresholds for Dynamic Systems +❌ Don't: `cpu_usage > 70` (fails during scale-up) +✅ Do: Alert on SLO violations or rate of change + +### 4. No "for" Clause +❌ Don't: Alert immediately on threshold breach +✅ Do: Use `for: 5m` to avoid flapping + +### 5. Too Many Recipients +❌ Don't: Page 10 people for every alert +✅ Do: Route to specific on-call rotation + +### 6. Duplicate Alerts +❌ Don't: Alert on both cause and symptom +✅ Do: Alert on symptom, use inhibition for causes + +### 7. No Runbook +❌ Don't: Alert without guidance +✅ Do: Include runbook_url in every alert + +--- + +## Alert Testing + +### Test Alert Firing +```bash +# Trigger test alert in Prometheus +amtool alert add alertname="TestAlert" \ + severity="warning" \ + summary="Test alert" + +# Or use Alertmanager API +curl -X POST http://alertmanager:9093/api/v1/alerts \ + -d '[{ + "labels": {"alertname": "TestAlert", "severity": "critical"}, + "annotations": {"summary": "Test critical alert"} + }]' +``` + +### Verify Alert Rules +```bash +# Check syntax +promtool check rules alerts.yml + +# Test expression +promtool query instant http://prometheus:9090 \ + 'sum(rate(http_requests_total{status=~"5.."}[5m]))' + +# Unit test alerts +promtool test rules test.yml +``` + +### Test Alertmanager Routing +```bash +# Test which receiver an alert would go to +amtool config routes test \ + --config.file=alertmanager.yml \ + alertname="HighLatency" \ + severity="critical" \ + team="backend" +``` + +--- + +## On-Call Best Practices + +### Rotation Schedule +- **Primary on-call**: First responder +- **Secondary on-call**: Escalation backup +- **Rotation length**: 1 week (balance load vs context) +- **Handoff**: Monday morning (not Friday evening) + +### On-Call Checklist +```markdown +## Pre-shift +- [ ] Test pager/phone +- [ ] Review recent incidents +- [ ] Check upcoming deployments +- [ ] Update contact info + +## During shift +- [ ] Respond to pages within 5 minutes +- [ ] Document all incidents +- [ ] Update runbooks if gaps found +- [ ] Communicate in #incidents channel + +## Post-shift +- [ ] Hand off open incidents +- [ ] Complete incident reports +- [ ] Suggest improvements +- [ ] Update team documentation +``` + +### Escalation Policy +1. **Primary**: Responds within 5 minutes +2. **Secondary**: Auto-escalate after 15 minutes +3. **Manager**: Auto-escalate after 30 minutes +4. **Incident Commander**: Critical incidents only + +--- + +## Metrics About Alerts + +Monitor your monitoring system! 
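+
+The key metrics listed below can also be wrapped in recording rules so alert-quality dashboards stay cheap to render. A minimal sketch (the group and rule names are illustrative; `ALERTS` is Prometheus's built-in alert-state metric):
+
+```yaml
+groups:
+  - name: alert-quality
+    rules:
+      # Firing alerts per team, pre-computed for dashboards
+      - record: team:alerts_firing:count
+        expr: sum(ALERTS{alertstate="firing"}) by (team)
+      # Share of firing alerts that are only warnings -- a rough noise indicator
+      - record: alerts:warning_ratio
+        expr: |
+          sum(ALERTS{alertstate="firing", severity="warning"})
+            /
+          sum(ALERTS{alertstate="firing"})
+```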
+ +### Key Metrics +```promql +# Alert firing frequency +sum(ALERTS{alertstate="firing"}) by (alertname) + +# Alert duration +ALERTS_FOR_STATE{alertstate="firing"} + +# Alerts per severity +sum(ALERTS{alertstate="firing"}) by (severity) + +# Time to acknowledge (from PagerDuty/etc) +pagerduty_incident_ack_duration_seconds +``` + +### Alert Quality Metrics +- **Mean Time to Acknowledge (MTTA)**: < 5 minutes +- **Mean Time to Resolve (MTTR)**: < 30 minutes +- **False Positive Rate**: < 10% +- **Alert Coverage**: % of incidents with preceding alert > 80% diff --git a/references/datadog_migration.md b/references/datadog_migration.md new file mode 100644 index 0000000..27fa5b5 --- /dev/null +++ b/references/datadog_migration.md @@ -0,0 +1,649 @@ +# Migrating from Datadog to Open-Source Stack + +## Overview + +This guide helps you migrate from Datadog to a cost-effective open-source observability stack: +- **Metrics**: Datadog → Prometheus + Grafana +- **Logs**: Datadog → Loki + Grafana +- **Traces**: Datadog APM → Tempo/Jaeger + Grafana +- **Dashboards**: Datadog → Grafana +- **Alerts**: Datadog Monitors → Prometheus Alertmanager + +**Estimated Cost Savings**: 60-80% for similar functionality + +--- + +## Cost Comparison + +### Example: 100-host infrastructure + +**Datadog**: +- Infrastructure Pro: $1,500/month (100 hosts × $15) +- Custom Metrics: $50/month (5,000 extra metrics beyond included 10,000) +- Logs: $2,000/month (20GB/day × $0.10/GB × 30 days) +- APM: $3,100/month (100 hosts × $31) +- **Total**: ~$6,650/month ($79,800/year) + +**Open-Source Stack** (self-hosted): +- Infrastructure: $1,200/month (EC2/GKE for Prometheus, Grafana, Loki, Tempo) +- Storage: $300/month (S3/GCS for long-term metrics and traces) +- Operations time: Variable +- **Total**: ~$1,500-2,500/month ($18,000-30,000/year) + +**Savings**: $49,800-61,800/year + +--- + +## Migration Strategy + +### Phase 1: Run Parallel (Month 1-2) +- Deploy open-source stack alongside Datadog +- Migrate metrics first (lowest risk) +- Validate data accuracy +- Build confidence + +### Phase 2: Migrate Dashboards & Alerts (Month 2-3) +- Convert Datadog dashboards to Grafana +- Translate alert rules +- Train team on new tools + +### Phase 3: Migrate Logs & Traces (Month 3-4) +- Set up Loki for log aggregation +- Deploy Tempo/Jaeger for tracing +- Update application instrumentation + +### Phase 4: Decommission Datadog (Month 4-5) +- Confirm all functionality migrated +- Cancel Datadog subscription +- Archive Datadog dashboards/alerts for reference + +--- + +## 1. 
Metrics Migration (Datadog → Prometheus) + +### Step 1: Deploy Prometheus + +**Kubernetes** (recommended): +```yaml +# prometheus-values.yaml +prometheus: + prometheusSpec: + retention: 30d + storageSpec: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 100Gi + + # Scrape configs + additionalScrapeConfigs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod +``` + +**Install**: +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install prometheus prometheus-community/kube-prometheus-stack -f prometheus-values.yaml +``` + +**Docker Compose**: +```yaml +version: '3' +services: + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=30d' + +volumes: + prometheus-data: +``` + +### Step 2: Replace DogStatsD with Prometheus Exporters + +**Before (DogStatsD)**: +```python +from datadog import statsd + +statsd.increment('page.views') +statsd.histogram('request.duration', 0.5) +statsd.gauge('active_users', 100) +``` + +**After (Prometheus Python client)**: +```python +from prometheus_client import Counter, Histogram, Gauge + +page_views = Counter('page_views_total', 'Page views') +request_duration = Histogram('request_duration_seconds', 'Request duration') +active_users = Gauge('active_users', 'Active users') + +# Usage +page_views.inc() +request_duration.observe(0.5) +active_users.set(100) +``` + +### Step 3: Metric Name Translation + +| Datadog Metric | Prometheus Equivalent | +|----------------|----------------------| +| `system.cpu.idle` | `node_cpu_seconds_total{mode="idle"}` | +| `system.mem.free` | `node_memory_MemFree_bytes` | +| `system.disk.used` | `node_filesystem_size_bytes - node_filesystem_free_bytes` | +| `docker.cpu.usage` | `container_cpu_usage_seconds_total` | +| `kubernetes.pods.running` | `kube_pod_status_phase{phase="Running"}` | + +### Step 4: Export Existing Datadog Metrics (Optional) + +Use Datadog API to export historical data: + +```python +from datadog import api, initialize + +options = { + 'api_key': 'YOUR_API_KEY', + 'app_key': 'YOUR_APP_KEY' +} +initialize(**options) + +# Query metric +result = api.Metric.query( + start=int(time.time() - 86400), # Last 24h + end=int(time.time()), + query='avg:system.cpu.user{*}' +) + +# Convert to Prometheus format and import +``` + +--- + +## 2. 
Dashboard Migration (Datadog → Grafana) + +### Step 1: Export Datadog Dashboards + +```python +import requests +import json + +api_key = "YOUR_API_KEY" +app_key = "YOUR_APP_KEY" + +headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key +} + +# Get all dashboards +response = requests.get( + 'https://api.datadoghq.com/api/v1/dashboard', + headers=headers +) + +dashboards = response.json() + +# Export each dashboard +for dashboard in dashboards['dashboards']: + dash_id = dashboard['id'] + detail = requests.get( + f'https://api.datadoghq.com/api/v1/dashboard/{dash_id}', + headers=headers + ).json() + + with open(f'datadog_{dash_id}.json', 'w') as f: + json.dump(detail, f, indent=2) +``` + +### Step 2: Convert to Grafana Format + +**Manual Conversion Template**: + +| Datadog Widget | Grafana Panel Type | +|----------------|-------------------| +| Timeseries | Graph / Time series | +| Query Value | Stat | +| Toplist | Table / Bar gauge | +| Heatmap | Heatmap | +| Distribution | Histogram | + +**Automated Conversion** (basic example): +```python +def convert_datadog_to_grafana(datadog_dashboard): + grafana_dashboard = { + "title": datadog_dashboard['title'], + "panels": [] + } + + for widget in datadog_dashboard['widgets']: + panel = { + "title": widget['definition'].get('title', ''), + "type": map_widget_type(widget['definition']['type']), + "targets": convert_queries(widget['definition']['requests']) + } + grafana_dashboard['panels'].append(panel) + + return grafana_dashboard +``` + +### Step 3: Common Query Translations + +See `dql_promql_translation.md` for comprehensive query mappings. + +**Example conversions**: + +``` +Datadog: avg:system.cpu.user{*} +Prometheus: avg(rate(node_cpu_seconds_total{mode="user"}[5m])) * 100 + +Datadog: sum:requests.count{status:200}.as_rate() +Prometheus: sum(rate(http_requests_total{status="200"}[5m])) + +Datadog: p95:request.duration{*} +Prometheus: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +## 3. 
Alert Migration (Datadog Monitors → Prometheus Alerts) + +### Step 1: Export Datadog Monitors + +```python +import requests + +api_key = "YOUR_API_KEY" +app_key = "YOUR_APP_KEY" + +headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key +} + +response = requests.get( + 'https://api.datadoghq.com/api/v1/monitor', + headers=headers +) + +monitors = response.json() + +# Save each monitor +for monitor in monitors: + with open(f'monitor_{monitor["id"]}.json', 'w') as f: + json.dump(monitor, f, indent=2) +``` + +### Step 2: Convert to Prometheus Alert Rules + +**Datadog Monitor**: +```json +{ + "name": "High CPU Usage", + "type": "metric alert", + "query": "avg(last_5m):avg:system.cpu.user{*} > 80", + "message": "CPU usage is high on {{host.name}}" +} +``` + +**Prometheus Alert**: +```yaml +groups: + - name: infrastructure + rules: + - alert: HighCPUUsage + expr: | + 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is {{ $value }}%" +``` + +### Step 3: Alert Routing (Datadog → Alertmanager) + +**Datadog notification channels** → **Alertmanager receivers** + +```yaml +# alertmanager.yml +route: + group_by: ['alertname', 'severity'] + receiver: 'slack-notifications' + +receivers: + - name: 'slack-notifications' + slack_configs: + - api_url: 'YOUR_SLACK_WEBHOOK' + channel: '#alerts' + + - name: 'pagerduty-critical' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_KEY' +``` + +--- + +## 4. Log Migration (Datadog → Loki) + +### Step 1: Deploy Loki + +**Kubernetes**: +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm install loki grafana/loki-stack \ + --set loki.persistence.enabled=true \ + --set loki.persistence.size=100Gi \ + --set promtail.enabled=true +``` + +**Docker Compose**: +```yaml +version: '3' +services: + loki: + image: grafana/loki:latest + ports: + - "3100:3100" + volumes: + - ./loki-config.yaml:/etc/loki/local-config.yaml + - loki-data:/loki + + promtail: + image: grafana/promtail:latest + volumes: + - /var/log:/var/log + - ./promtail-config.yaml:/etc/promtail/config.yml + +volumes: + loki-data: +``` + +### Step 2: Replace Datadog Log Forwarder + +**Before (Datadog Agent)**: +```yaml +# datadog.yaml +logs_enabled: true + +logs_config: + container_collect_all: true +``` + +**After (Promtail)**: +```yaml +# promtail-config.yaml +server: + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: system + static_configs: + - targets: + - localhost + labels: + job: varlogs + __path__: /var/log/*.log +``` + +### Step 3: Query Translation + +**Datadog Logs Query**: +``` +service:my-app status:error +``` + +**Loki LogQL**: +```logql +{job="my-app", level="error"} +``` + +**More examples**: +``` +Datadog: service:api-gateway status:error @http.status_code:>=500 +Loki: {service="api-gateway", level="error"} | json | http_status_code >= 500 + +Datadog: source:nginx "404" +Loki: {source="nginx"} |= "404" +``` + +--- + +## 5. 
APM Migration (Datadog APM → Tempo/Jaeger) + +### Step 1: Choose Tracing Backend + +- **Tempo**: Better for high volume, cheaper storage (object storage) +- **Jaeger**: More mature, better UI, requires separate storage + +### Step 2: Replace Datadog Tracer with OpenTelemetry + +**Before (Datadog Python)**: +```python +from ddtrace import tracer + +@tracer.wrap() +def my_function(): + pass +``` + +**After (OpenTelemetry)**: +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +# Setup +trace.set_tracer_provider(TracerProvider()) +tracer = trace.get_tracer(__name__) +exporter = OTLPSpanExporter(endpoint="tempo:4317") + +@tracer.start_as_current_span("my_function") +def my_function(): + pass +``` + +### Step 3: Deploy Tempo + +```yaml +# tempo.yaml +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +storage: + trace: + backend: s3 + s3: + bucket: tempo-traces + endpoint: s3.amazonaws.com +``` + +--- + +## 6. Infrastructure Migration + +### Recommended Architecture + +``` +┌─────────────────────────────────────────┐ +│ Grafana (Visualization) │ +│ - Dashboards │ +│ - Unified view │ +└─────────────────────────────────────────┘ + ↓ ↓ ↓ +┌──────────────┐ ┌──────────┐ ┌──────────┐ +│ Prometheus │ │ Loki │ │ Tempo │ +│ (Metrics) │ │ (Logs) │ │ (Traces) │ +└──────────────┘ └──────────┘ └──────────┘ + ↓ ↓ ↓ +┌─────────────────────────────────────────┐ +│ Applications (OpenTelemetry) │ +└─────────────────────────────────────────┘ +``` + +### Sizing Recommendations + +**100-host environment**: + +- **Prometheus**: 2-4 CPU, 8-16GB RAM, 100GB SSD +- **Grafana**: 1 CPU, 2GB RAM +- **Loki**: 2-4 CPU, 8GB RAM, 100GB SSD +- **Tempo**: 2-4 CPU, 8GB RAM, S3 for storage +- **Alertmanager**: 1 CPU, 1GB RAM + +**Total**: ~8-16 CPU, 32-64GB RAM, 200GB SSD + object storage + +--- + +## 7. Migration Checklist + +### Pre-Migration +- [ ] Calculate current Datadog costs +- [ ] Identify all Datadog integrations +- [ ] Export all dashboards +- [ ] Export all monitors +- [ ] Document custom metrics +- [ ] Get stakeholder approval + +### During Migration +- [ ] Deploy Prometheus + Grafana +- [ ] Deploy Loki + Promtail +- [ ] Deploy Tempo/Jaeger (if using APM) +- [ ] Migrate metrics instrumentation +- [ ] Convert dashboards (top 10 critical first) +- [ ] Convert alerts (critical alerts first) +- [ ] Update application logging +- [ ] Replace APM instrumentation +- [ ] Run parallel for 2-4 weeks +- [ ] Validate data accuracy +- [ ] Train team on new tools + +### Post-Migration +- [ ] Decommission Datadog agent from all hosts +- [ ] Cancel Datadog subscription +- [ ] Archive Datadog configs +- [ ] Document new workflows +- [ ] Create runbooks for common tasks + +--- + +## 8. 
Common Challenges & Solutions + +### Challenge: Missing Datadog Features + +**Datadog Synthetic Monitoring**: +- Solution: Use **Blackbox Exporter** (Prometheus) or **Grafana Synthetic Monitoring** + +**Datadog Network Performance Monitoring**: +- Solution: Use **Cilium Hubble** (Kubernetes) or **eBPF-based tools** + +**Datadog RUM (Real User Monitoring)**: +- Solution: Use **Grafana Faro** or **OpenTelemetry Browser SDK** + +### Challenge: Team Learning Curve + +**Solution**: +- Provide training sessions (2-3 hours per tool) +- Create internal documentation with examples +- Set up sandbox environment for practice +- Assign champions for each tool + +### Challenge: Query Performance + +**Prometheus too slow**: +- Use **Thanos** or **Cortex** for scaling +- Implement recording rules for expensive queries +- Increase retention only where needed + +**Loki too slow**: +- Add more labels for better filtering +- Use chunk caching +- Consider **parallel query execution** + +--- + +## 9. Maintenance Comparison + +### Datadog (Managed) +- **Ops burden**: Low (fully managed) +- **Upgrades**: Automatic +- **Scaling**: Automatic +- **Cost**: High ($6k-10k+/month) + +### Open-Source Stack (Self-hosted) +- **Ops burden**: Medium (requires ops team) +- **Upgrades**: Manual (quarterly) +- **Scaling**: Manual planning required +- **Cost**: Low ($1.5k-3k/month infrastructure) + +**Hybrid Option**: Use **Grafana Cloud** (managed Prometheus/Loki/Tempo) +- Cost: ~$3k/month for 100 hosts +- Ops burden: Low +- Savings: ~50% vs Datadog + +--- + +## 10. ROI Calculation + +### Example Scenario + +**Before (Datadog)**: +- Monthly cost: $7,000 +- Annual cost: $84,000 + +**After (Self-hosted OSS)**: +- Infrastructure: $1,800/month +- Operations (0.5 FTE): $4,000/month +- Annual cost: $69,600 + +**Savings**: $14,400/year + +**After (Grafana Cloud)**: +- Monthly cost: $3,500 +- Annual cost: $42,000 + +**Savings**: $42,000/year (50%) + +**Break-even**: Immediate (no migration costs beyond engineering time) + +--- + +## Resources + +- **Prometheus**: https://prometheus.io/docs/ +- **Grafana**: https://grafana.com/docs/ +- **Loki**: https://grafana.com/docs/loki/ +- **Tempo**: https://grafana.com/docs/tempo/ +- **OpenTelemetry**: https://opentelemetry.io/ +- **Migration Tools**: https://github.com/grafana/dashboard-linter + +--- + +## Support + +If you need help with migration: +- Grafana Labs offers migration consulting +- Many SRE consulting firms specialize in this +- Community support via Slack/Discord channels diff --git a/references/dql_promql_translation.md b/references/dql_promql_translation.md new file mode 100644 index 0000000..1dd3d12 --- /dev/null +++ b/references/dql_promql_translation.md @@ -0,0 +1,756 @@ +# DQL (Datadog Query Language) ↔ PromQL Translation Guide + +## Quick Reference + +| Concept | Datadog (DQL) | Prometheus (PromQL) | +|---------|---------------|---------------------| +| Aggregation | `avg:`, `sum:`, `min:`, `max:` | `avg()`, `sum()`, `min()`, `max()` | +| Rate | `.as_rate()`, `.as_count()` | `rate()`, `increase()` | +| Percentile | `p50:`, `p95:`, `p99:` | `histogram_quantile()` | +| Filtering | `{tag:value}` | `{label="value"}` | +| Time window | `last_5m`, `last_1h` | `[5m]`, `[1h]` | + +--- + +## Basic Queries + +### Simple Metric Query + +**Datadog**: +``` +system.cpu.user +``` + +**Prometheus**: +```promql +node_cpu_seconds_total{mode="user"} +``` + +--- + +### Metric with Filter + +**Datadog**: +``` +system.cpu.user{host:web-01} +``` + +**Prometheus**: +```promql 
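+# Note: Datadog's host tag usually maps to the "instance" label; its exact value depends on your scrape config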
+node_cpu_seconds_total{mode="user", instance="web-01"}
+```
+
+---
+
+### Multiple Filters (AND)
+
+**Datadog**:
+```
+system.cpu.user{host:web-01,env:production}
+```
+
+**Prometheus**:
+```promql
+node_cpu_seconds_total{mode="user", instance="web-01", env="production"}
+```
+
+---
+
+### Wildcard Filters
+
+**Datadog**:
+```
+system.cpu.user{host:web-*}
+```
+
+**Prometheus**:
+```promql
+node_cpu_seconds_total{mode="user", instance=~"web-.*"}
+```
+
+---
+
+### OR Filters
+
+**Datadog**:
+```
+system.cpu.user{host:web-01 OR host:web-02}
+```
+
+**Prometheus**:
+```promql
+node_cpu_seconds_total{mode="user", instance=~"web-01|web-02"}
+```
+
+---
+
+## Aggregations
+
+### Average
+
+**Datadog**:
+```
+avg:system.cpu.user{*}
+```
+
+**Prometheus**:
+```promql
+avg(node_cpu_seconds_total{mode="user"})
+```
+
+---
+
+### Sum
+
+**Datadog**:
+```
+sum:requests.count{*}
+```
+
+**Prometheus**:
+```promql
+sum(http_requests_total)
+```
+
+---
+
+### Min/Max
+
+**Datadog**:
+```
+min:system.mem.free{*}
+max:system.mem.free{*}
+```
+
+**Prometheus**:
+```promql
+min(node_memory_MemFree_bytes)
+max(node_memory_MemFree_bytes)
+```
+
+---
+
+### Aggregation by Tag/Label
+
+**Datadog**:
+```
+avg:system.cpu.user{*} by {host}
+```
+
+**Prometheus**:
+```promql
+avg by (instance) (node_cpu_seconds_total{mode="user"})
+```
+
+---
+
+## Rates and Counts
+
+### Rate (per second)
+
+**Datadog**:
+```
+sum:requests.count{*}.as_rate()
+```
+
+**Prometheus**:
+```promql
+sum(rate(http_requests_total[5m]))
+```
+
+Note: Prometheus requires an explicit time window `[5m]`
+
+---
+
+### Count (total over time)
+
+**Datadog**:
+```
+sum:requests.count{*}.as_count()
+```
+
+**Prometheus**:
+```promql
+sum(increase(http_requests_total[1h]))
+```
+
+---
+
+### Derivative (change over time)
+
+**Datadog**:
+```
+derivative(avg:system.disk.used{*})
+```
+
+**Prometheus**:
+```promql
+deriv(node_filesystem_size_bytes[5m])
+```
+
+---
+
+## Percentiles
+
+### P50 (Median)
+
+**Datadog**:
+```
+p50:request.duration{*}
+```
+
+**Prometheus** (requires histogram):
+```promql
+histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
+```
+
+---
+
+### P95
+
+**Datadog**:
+```
+p95:request.duration{*}
+```
+
+**Prometheus**:
+```promql
+histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
+```
+
+---
+
+### P99
+
+**Datadog**:
+```
+p99:request.duration{*}
+```
+
+**Prometheus**:
+```promql
+histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
+```
+
+---
+
+## Time Windows
+
+### Last 5 minutes
+
+**Datadog**:
+```
+avg(last_5m):system.cpu.user{*}
+```
+
+**Prometheus**:
+```promql
+avg_over_time(node_cpu_seconds_total{mode="user"}[5m])
+```
+
+---
+
+### Last 1 hour
+
+**Datadog**:
+```
+avg(last_1h):system.cpu.user{*}
+```
+
+**Prometheus**:
+```promql
+avg_over_time(node_cpu_seconds_total{mode="user"}[1h])
+```
+
+---
+
+## Math Operations
+
+### Division
+
+**Datadog**:
+```
+avg:system.mem.used{*} / avg:system.mem.total{*}
+```
+
+**Prometheus**:
+```promql
+(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
+```
+
+---
+
+### Multiplication
+
+**Datadog**:
+```
+avg:system.cpu.user{*} * 100
+```
+
+**Prometheus**:
+```promql
+avg(node_cpu_seconds_total{mode="user"}) * 100
+```
+
+---
+
+### Percentage Calculation
+
+**Datadog**:
+```
+(sum:requests.errors{*} / sum:requests.count{*}) * 100
+```
+
+**Prometheus**:
+```promql
+(sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100
+```
+
+---
+
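+For bulk migrations, the simplest filter-style queries at the top of this guide can be converted mechanically once the team maintains a metric-name map; anything involving rates, percentiles, or arithmetic still needs the manual translations shown here. A rough sketch (the `METRIC_MAP`/`TAG_MAP` tables and the function name are illustrative, not part of any existing tool):
+
+```python
+import re
+
+# Team-maintained mapping from Datadog metric names to Prometheus selectors.
+# These entries are examples only; extend the map as you audit your dashboards.
+METRIC_MAP = {
+    "system.cpu.user": 'node_cpu_seconds_total{mode="user"}',
+    "system.mem.free": "node_memory_MemFree_bytes",
+}
+
+# Tag keys that are renamed on the Prometheus side (host -> instance, etc.)
+TAG_MAP = {"host": "instance"}
+
+def translate_simple_query(dql: str) -> str:
+    """Handle only 'metric{tag:value,...}' queries; anything else needs a human."""
+    match = re.fullmatch(r"([\w.]+)(?:\{([^}]*)\})?", dql.strip())
+    if not match:
+        raise ValueError(f"needs manual translation: {dql}")
+    metric, tags = match.groups()
+    base = METRIC_MAP.get(metric, metric.replace(".", "_"))
+    if not tags or tags.strip() == "*":
+        return base
+    labels = ", ".join(
+        f'{TAG_MAP.get(k.strip(), k.strip())}="{v.strip()}"'
+        for k, v in (t.split(":", 1) for t in tags.split(","))
+    )
+    # Merge with any labels already present in the mapped selector
+    return base[:-1] + ", " + labels + "}" if base.endswith("}") else base + "{" + labels + "}"
+
+print(translate_simple_query("system.cpu.user{host:web-01}"))
+# -> node_cpu_seconds_total{mode="user", instance="web-01"}
+```
+
+Keeping the map in version control doubles as the team-specific translation guide recommended in the migration tips later in this guide.
+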
+## Common Use Cases + +### CPU Usage Percentage + +**Datadog**: +``` +100 - avg:system.cpu.idle{*} +``` + +**Prometheus**: +```promql +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +--- + +### Memory Usage Percentage + +**Datadog**: +``` +(avg:system.mem.used{*} / avg:system.mem.total{*}) * 100 +``` + +**Prometheus**: +```promql +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 +``` + +--- + +### Disk Usage Percentage + +**Datadog**: +``` +(avg:system.disk.used{*} / avg:system.disk.total{*}) * 100 +``` + +**Prometheus**: +```promql +(node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 +``` + +--- + +### Request Rate (requests/sec) + +**Datadog**: +``` +sum:requests.count{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(http_requests_total[5m])) +``` + +--- + +### Error Rate Percentage + +**Datadog**: +``` +(sum:requests.errors{*}.as_rate() / sum:requests.count{*}.as_rate()) * 100 +``` + +**Prometheus**: +```promql +(sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 +``` + +--- + +### Request Latency (P95) + +**Datadog**: +``` +p95:request.duration{*} +``` + +**Prometheus**: +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +### Top 5 Hosts by CPU + +**Datadog**: +``` +top(avg:system.cpu.user{*} by {host}, 5, 'mean', 'desc') +``` + +**Prometheus**: +```promql +topk(5, avg by (instance) (rate(node_cpu_seconds_total{mode="user"}[5m]))) +``` + +--- + +## Functions + +### Absolute Value + +**Datadog**: +``` +abs(diff(avg:system.cpu.user{*})) +``` + +**Prometheus**: +```promql +abs(delta(node_cpu_seconds_total{mode="user"}[5m])) +``` + +--- + +### Ceiling/Floor + +**Datadog**: +``` +ceil(avg:system.cpu.user{*}) +floor(avg:system.cpu.user{*}) +``` + +**Prometheus**: +```promql +ceil(avg(node_cpu_seconds_total{mode="user"})) +floor(avg(node_cpu_seconds_total{mode="user"})) +``` + +--- + +### Clamp (Limit Range) + +**Datadog**: +``` +clamp_min(avg:system.cpu.user{*}, 0) +clamp_max(avg:system.cpu.user{*}, 100) +``` + +**Prometheus**: +```promql +clamp_min(avg(node_cpu_seconds_total{mode="user"}), 0) +clamp_max(avg(node_cpu_seconds_total{mode="user"}), 100) +``` + +--- + +### Moving Average + +**Datadog**: +``` +moving_rollup(avg:system.cpu.user{*}, 60, 'avg') +``` + +**Prometheus**: +```promql +avg_over_time(node_cpu_seconds_total{mode="user"}[1h]) +``` + +--- + +## Advanced Patterns + +### Compare to Previous Period + +**Datadog**: +``` +sum:requests.count{*}.as_rate() / timeshift(sum:requests.count{*}.as_rate(), 3600) +``` + +**Prometheus**: +```promql +sum(rate(http_requests_total[5m])) / sum(rate(http_requests_total[5m] offset 1h)) +``` + +--- + +### Forecast + +**Datadog**: +``` +forecast(avg:system.disk.used{*}, 'linear', 1) +``` + +**Prometheus**: +```promql +predict_linear(node_filesystem_size_bytes[1h], 3600) +``` + +Note: Predicts value 1 hour in future based on last 1 hour trend + +--- + +### Anomaly Detection + +**Datadog**: +``` +anomalies(avg:system.cpu.user{*}, 'basic', 2) +``` + +**Prometheus**: No built-in function +- Use recording rules with stddev +- External tools like **Robust Perception's anomaly detector** +- Or use **Grafana ML** plugin + +--- + +### Outlier Detection + +**Datadog**: +``` +outliers(avg:system.cpu.user{*} by {host}, 'mad') +``` + +**Prometheus**: No built-in function +- Calculate manually with stddev: +```promql +abs(metric - avg(metric)) > 2 * 
stddev(metric) +``` + +--- + +## Container & Kubernetes + +### Container CPU Usage + +**Datadog**: +``` +avg:docker.cpu.usage{*} by {container_name} +``` + +**Prometheus**: +```promql +avg by (container) (rate(container_cpu_usage_seconds_total[5m])) +``` + +--- + +### Container Memory Usage + +**Datadog**: +``` +avg:docker.mem.rss{*} by {container_name} +``` + +**Prometheus**: +```promql +avg by (container) (container_memory_rss) +``` + +--- + +### Pod Count by Status + +**Datadog**: +``` +sum:kubernetes.pods.running{*} by {kube_namespace} +``` + +**Prometheus**: +```promql +sum by (namespace) (kube_pod_status_phase{phase="Running"}) +``` + +--- + +## Database Queries + +### MySQL Queries Per Second + +**Datadog**: +``` +sum:mysql.performance.queries{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(mysql_global_status_queries[5m])) +``` + +--- + +### PostgreSQL Active Connections + +**Datadog**: +``` +avg:postgresql.connections{*} +``` + +**Prometheus**: +```promql +avg(pg_stat_database_numbackends) +``` + +--- + +### Redis Memory Usage + +**Datadog**: +``` +avg:redis.mem.used{*} +``` + +**Prometheus**: +```promql +avg(redis_memory_used_bytes) +``` + +--- + +## Network Metrics + +### Network Bytes Sent + +**Datadog**: +``` +sum:system.net.bytes_sent{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(node_network_transmit_bytes_total[5m])) +``` + +--- + +### Network Bytes Received + +**Datadog**: +``` +sum:system.net.bytes_rcvd{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(node_network_receive_bytes_total[5m])) +``` + +--- + +## Key Differences + +### 1. Time Windows +- **Datadog**: Optional, defaults to query time range +- **Prometheus**: Always required for rate/increase functions + +### 2. Histograms +- **Datadog**: Percentiles available directly +- **Prometheus**: Requires histogram buckets + `histogram_quantile()` + +### 3. Default Aggregation +- **Datadog**: No default, must specify +- **Prometheus**: Returns all time series unless aggregated + +### 4. Metric Types +- **Datadog**: All metrics treated similarly +- **Prometheus**: Explicit types (counter, gauge, histogram, summary) + +### 5. Tag vs Label +- **Datadog**: Uses "tags" (key:value) +- **Prometheus**: Uses "labels" (key="value") + +--- + +## Migration Tips + +1. **Start with dashboards**: Convert most-used dashboards first +2. **Use recording rules**: Pre-calculate expensive PromQL queries +3. **Test in parallel**: Run both systems during migration +4. **Document mappings**: Create team-specific translation guide +5. 
**Train team**: PromQL has learning curve, invest in training + +--- + +## Tools + +- **Datadog Dashboard Exporter**: Export JSON dashboards +- **Grafana Dashboard Linter**: Validate converted dashboards +- **PromQL Learning Resources**: https://prometheus.io/docs/prometheus/latest/querying/basics/ + +--- + +## Common Gotchas + +### Rate without Time Window + +❌ **Wrong**: +```promql +rate(http_requests_total) +``` + +✅ **Correct**: +```promql +rate(http_requests_total[5m]) +``` + +--- + +### Aggregating Before Rate + +❌ **Wrong**: +```promql +rate(sum(http_requests_total)[5m]) +``` + +✅ **Correct**: +```promql +sum(rate(http_requests_total[5m])) +``` + +--- + +### Histogram Quantile Without by (le) + +❌ **Wrong**: +```promql +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +``` + +✅ **Correct**: +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +## Quick Conversion Checklist + +When converting a Datadog query to PromQL: + +- [ ] Replace metric name (e.g., `system.cpu.user` → `node_cpu_seconds_total`) +- [ ] Convert tags to labels (`{tag:value}` → `{label="value"}`) +- [ ] Add time window for rate/increase (`[5m]`) +- [ ] Change aggregation syntax (`avg:` → `avg()`) +- [ ] Convert percentiles to histogram_quantile if needed +- [ ] Test query in Prometheus before adding to dashboard +- [ ] Add `by (label)` for grouped aggregations + +--- + +## Need More Help? + +- See `datadog_migration.md` for full migration guide +- PromQL documentation: https://prometheus.io/docs/prometheus/latest/querying/ +- Practice at: https://demo.promlens.com/ diff --git a/references/logging_guide.md b/references/logging_guide.md new file mode 100644 index 0000000..8b11d44 --- /dev/null +++ b/references/logging_guide.md @@ -0,0 +1,775 @@ +# Logging Guide + +## Structured Logging + +### Why Structured Logs? + +**Unstructured** (text): +``` +2024-10-28 14:32:15 User john@example.com logged in from 192.168.1.1 +``` + +**Structured** (JSON): +```json +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "info", + "message": "User logged in", + "user": "john@example.com", + "ip": "192.168.1.1", + "event_type": "user_login" +} +``` + +**Benefits**: +- Easy to parse and query +- Consistent format +- Machine-readable +- Efficient storage and indexing + +--- + +## Log Levels + +Use appropriate log levels for better filtering and alerting. 
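+
+Make the active level configurable rather than hard-coded; a minimal sketch using Python's standard `logging` module (the `LOG_LEVEL` environment variable name is just an assumed convention):
+
+```python
+import logging
+import os
+
+# Read the desired level from the environment, defaulting to INFO for production
+level_name = os.getenv("LOG_LEVEL", "INFO").upper()
+logging.basicConfig(level=getattr(logging, level_name, logging.INFO))
+
+logger = logging.getLogger("payment-service")
+logger.debug("Emitted only when LOG_LEVEL=DEBUG")
+logger.info("Emitted at INFO and below")
+```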
+ +### DEBUG +**When**: Development, troubleshooting +**Examples**: +- Function entry/exit +- Variable values +- Internal state changes + +```python +logger.debug("Processing request", extra={ + "request_id": req_id, + "params": params +}) +``` + +### INFO +**When**: Important business events +**Examples**: +- User actions (login, purchase) +- System state changes (started, stopped) +- Significant milestones + +```python +logger.info("Order placed", extra={ + "order_id": "12345", + "user_id": "user123", + "amount": 99.99 +}) +``` + +### WARN +**When**: Potentially problematic situations +**Examples**: +- Deprecated API usage +- Slow operations (but not failing) +- Retry attempts +- Resource usage approaching limits + +```python +logger.warning("API response slow", extra={ + "endpoint": "/api/users", + "duration_ms": 2500, + "threshold_ms": 1000 +}) +``` + +### ERROR +**When**: Error conditions that need attention +**Examples**: +- Failed requests +- Exceptions caught and handled +- Integration failures +- Data validation errors + +```python +logger.error("Payment processing failed", extra={ + "order_id": "12345", + "error": str(e), + "payment_gateway": "stripe" +}, exc_info=True) +``` + +### FATAL/CRITICAL +**When**: Severe errors causing shutdown +**Examples**: +- Database connection lost +- Out of memory +- Configuration errors preventing startup + +```python +logger.critical("Database connection lost", extra={ + "database": "postgres", + "host": "db.example.com", + "attempt": 3 +}) +``` + +--- + +## Required Fields + +Every log entry should include: + +### 1. Timestamp +ISO 8601 format with timezone: +```json +{ + "timestamp": "2024-10-28T14:32:15.123Z" +} +``` + +### 2. Level +Standard levels: debug, info, warn, error, critical +```json +{ + "level": "error" +} +``` + +### 3. Message +Human-readable description: +```json +{ + "message": "User authentication failed" +} +``` + +### 4. Service/Application +What component logged this: +```json +{ + "service": "api-gateway", + "version": "1.2.3" +} +``` + +### 5. Environment +```json +{ + "environment": "production" +} +``` + +--- + +## Recommended Fields + +### Request Context +```json +{ + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "user_id": "user123", + "session_id": "sess_abc", + "ip_address": "192.168.1.1", + "user_agent": "Mozilla/5.0..." 
+} +``` + +### Performance Metrics +```json +{ + "duration_ms": 245, + "response_size_bytes": 1024 +} +``` + +### Error Details +```json +{ + "error_type": "ValidationError", + "error_message": "Invalid email format", + "stack_trace": "...", + "error_code": "VAL_001" +} +``` + +### Business Context +```json +{ + "order_id": "ORD-12345", + "customer_id": "CUST-789", + "transaction_amount": 99.99, + "payment_method": "credit_card" +} +``` + +--- + +## Implementation Examples + +### Python (using structlog) +```python +import structlog + +logger = structlog.get_logger() + +# Configure structured logging +structlog.configure( + processors=[ + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.add_log_level, + structlog.processors.JSONRenderer() + ] +) + +# Usage +logger.info( + "user_logged_in", + user_id="user123", + ip_address="192.168.1.1", + login_method="oauth" +) +``` + +### Node.js (using Winston) +```javascript +const winston = require('winston'); + +const logger = winston.createLogger({ + format: winston.format.json(), + defaultMeta: { service: 'api-gateway' }, + transports: [ + new winston.transports.Console() + ] +}); + +logger.info('User logged in', { + userId: 'user123', + ipAddress: '192.168.1.1', + loginMethod: 'oauth' +}); +``` + +### Go (using zap) +```go +import "go.uber.org/zap" + +logger, _ := zap.NewProduction() +defer logger.Sync() + +logger.Info("User logged in", + zap.String("userId", "user123"), + zap.String("ipAddress", "192.168.1.1"), + zap.String("loginMethod", "oauth"), +) +``` + +### Java (using Logback with JSON) +```java +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import net.logstash.logback.argument.StructuredArguments; + +Logger logger = LoggerFactory.getLogger(MyClass.class); + +logger.info("User logged in", + StructuredArguments.kv("userId", "user123"), + StructuredArguments.kv("ipAddress", "192.168.1.1"), + StructuredArguments.kv("loginMethod", "oauth") +); +``` + +--- + +## Log Aggregation Patterns + +### Pattern 1: ELK Stack (Elasticsearch, Logstash, Kibana) + +**Architecture**: +``` +Application → Filebeat → Logstash → Elasticsearch → Kibana +``` + +**filebeat.yml**: +```yaml +filebeat.inputs: + - type: log + enabled: true + paths: + - /var/log/app/*.log + json.keys_under_root: true + json.add_error_key: true + +output.logstash: + hosts: ["logstash:5044"] +``` + +**logstash.conf**: +``` +input { + beats { + port => 5044 + } +} + +filter { + json { + source => "message" + } + + date { + match => ["timestamp", "ISO8601"] + } + + grok { + match => { "message" => "%{COMBINEDAPACHELOG}" } + } +} + +output { + elasticsearch { + hosts => ["elasticsearch:9200"] + index => "app-logs-%{+YYYY.MM.dd}" + } +} +``` + +### Pattern 2: Loki (Grafana Loki) + +**Architecture**: +``` +Application → Promtail → Loki → Grafana +``` + +**promtail-config.yml**: +```yaml +server: + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: app + static_configs: + - targets: + - localhost + labels: + job: app + __path__: /var/log/app/*.log + pipeline_stages: + - json: + expressions: + level: level + timestamp: timestamp + - labels: + level: + service: + - timestamp: + source: timestamp + format: RFC3339 +``` + +**Query in Grafana**: +```logql +{job="app"} |= "error" | json | level="error" +``` + +### Pattern 3: CloudWatch Logs + +**Install CloudWatch agent**: +```json +{ + "logs": { + "logs_collected": { + "files": { + "collect_list": [ + { + 
"file_path": "/var/log/app/*.log", + "log_group_name": "/aws/app/production", + "log_stream_name": "{instance_id}", + "timezone": "UTC" + } + ] + } + } + } +} +``` + +**Query with CloudWatch Insights**: +``` +fields @timestamp, level, message, user_id +| filter level = "error" +| sort @timestamp desc +| limit 100 +``` + +### Pattern 4: Fluentd/Fluent Bit + +**fluent-bit.conf**: +``` +[INPUT] + Name tail + Path /var/log/app/*.log + Parser json + Tag app.* + +[FILTER] + Name record_modifier + Match * + Record hostname ${HOSTNAME} + Record cluster production + +[OUTPUT] + Name es + Match * + Host elasticsearch + Port 9200 + Index app-logs + Type _doc +``` + +--- + +## Query Patterns + +### Find Errors in Time Range +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "level": "error" } }, + { "range": { "@timestamp": { + "gte": "now-1h", + "lte": "now" + }}} + ] + } + } +} +``` + +**Loki (LogQL)**: +```logql +{job="app", level="error"} |= "error" +``` + +**CloudWatch Insights**: +``` +fields @timestamp, @message +| filter level = "error" +| filter @timestamp > ago(1h) +``` + +### Count Errors by Type +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "size": 0, + "query": { "match": { "level": "error" } }, + "aggs": { + "error_types": { + "terms": { "field": "error_type.keyword" } + } + } +} +``` + +**Loki**: +```logql +sum by (error_type) (count_over_time({job="app", level="error"}[1h])) +``` + +### Find Slow Requests +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "query": { + "range": { "duration_ms": { "gte": 1000 } } + }, + "sort": [ { "duration_ms": "desc" } ] +} +``` + +### Trace Request Through Services +**Elasticsearch** (using request_id): +```json +GET /_search +{ + "query": { + "match": { "request_id": "550e8400-e29b-41d4-a716-446655440000" } + }, + "sort": [ { "@timestamp": "asc" } ] +} +``` + +--- + +## Sampling and Rate Limiting + +### When to Sample +- **High volume services**: > 10,000 logs/second +- **Debug logs in production**: Sample 1-10% +- **Cost optimization**: Reduce storage costs + +### Sampling Strategies + +**1. Random Sampling**: +```python +import random + +if random.random() < 0.1: # Sample 10% + logger.debug("Debug message", ...) +``` + +**2. Rate Limiting**: +```python +from rate_limiter import RateLimiter + +limiter = RateLimiter(max_per_second=100) + +if limiter.allow(): + logger.info("Rate limited log", ...) +``` + +**3. Error-Biased Sampling**: +```python +# Always log errors, sample successful requests +if level == "error" or random.random() < 0.01: + logger.log(level, message, ...) +``` + +**4. 
Head-Based Sampling** (trace-aware): +```python +# If trace is sampled, log all related logs +if trace_context.is_sampled(): + logger.info("Traced log", trace_id=trace_context.trace_id) +``` + +--- + +## Log Retention + +### Retention Strategy + +**Hot tier** (fast SSD): 7-30 days +- Recent logs +- Full query performance +- High cost + +**Warm tier** (regular disk): 30-90 days +- Older logs +- Slower queries acceptable +- Medium cost + +**Cold tier** (object storage): 90+ days +- Archive logs +- Query via restore +- Low cost + +### Example: Elasticsearch ILM Policy +```json +{ + "policy": { + "phases": { + "hot": { + "actions": { + "rollover": { + "max_size": "50GB", + "max_age": "1d" + } + } + }, + "warm": { + "min_age": "7d", + "actions": { + "allocate": { "number_of_replicas": 1 }, + "shrink": { "number_of_shards": 1 } + } + }, + "cold": { + "min_age": "30d", + "actions": { + "allocate": { "require": { "box_type": "cold" } } + } + }, + "delete": { + "min_age": "90d", + "actions": { + "delete": {} + } + } + } + } +} +``` + +--- + +## Security and Compliance + +### PII Redaction + +**Before logging**: +```python +import re + +def redact_pii(data): + # Redact email + data = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', + '[EMAIL]', data) + # Redact credit card + data = re.sub(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', + '[CARD]', data) + # Redact SSN + data = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', data) + return data + +logger.info("User data", user_input=redact_pii(user_input)) +``` + +**In Logstash**: +``` +filter { + mutate { + gsub => [ + "message", "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "[EMAIL]", + "message", "\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b", "[CARD]" + ] + } +} +``` + +### Access Control + +**Elasticsearch** (with Security): +```yaml +# Role for developers +dev_logs: + indices: + - names: ['app-logs-*'] + privileges: ['read'] + query: '{"match": {"environment": "development"}}' +``` + +**CloudWatch** (IAM Policy): +```json +{ + "Effect": "Allow", + "Action": [ + "logs:DescribeLogGroups", + "logs:GetLogEvents", + "logs:FilterLogEvents" + ], + "Resource": "arn:aws:logs:*:*:log-group:/aws/app/production:*" +} +``` + +--- + +## Common Pitfalls + +### 1. Logging Sensitive Data +❌ `logger.info("Login", password=password)` +✅ `logger.info("Login", user_id=user_id)` + +### 2. Excessive Logging +❌ Logging every iteration of a loop +✅ Log aggregate results or sample + +### 3. Not Including Context +❌ `logger.error("Failed")` +✅ `logger.error("Payment failed", order_id=order_id, error=str(e))` + +### 4. Inconsistent Formats +❌ Mix of JSON and plain text +✅ Pick one format and stick to it + +### 5. No Request IDs +❌ Can't trace request across services +✅ Generate and propagate request_id + +### 6. Logging to Multiple Places +❌ Log to file AND stdout AND syslog +✅ Log to stdout, let agent handle routing + +### 7. Blocking on Log Writes +❌ Synchronous writes to remote systems +✅ Asynchronous buffered writes + +--- + +## Performance Optimization + +### 1. Async Logging +```python +import logging +from logging.handlers import QueueHandler, QueueListener +import queue + +# Create queue +log_queue = queue.Queue() + +# Configure async handler +queue_handler = QueueHandler(log_queue) +logger.addHandler(queue_handler) + +# Process logs in background thread +listener = QueueListener(log_queue, *handlers) +listener.start() +``` + +### 2. 
Conditional Logging +```python +# Avoid expensive operations if not logging +if logger.isEnabledFor(logging.DEBUG): + logger.debug("Details", data=expensive_serialization(obj)) +``` + +### 3. Batching +```python +# Batch logs before sending +batch = [] +for log in logs: + batch.append(log) + if len(batch) >= 100: + send_to_aggregator(batch) + batch = [] +``` + +### 4. Compression +```yaml +# Filebeat with compression +output.logstash: + hosts: ["logstash:5044"] + compression_level: 3 +``` + +--- + +## Monitoring Log Pipeline + +Track pipeline health with metrics: + +```promql +# Log ingestion rate +rate(logs_ingested_total[5m]) + +# Pipeline lag +log_processing_lag_seconds + +# Dropped logs +rate(logs_dropped_total[5m]) + +# Error parsing rate +rate(logs_parse_errors_total[5m]) +``` + +Alert on: +- Sudden drop in log volume (service down?) +- High parse error rate (format changed?) +- Pipeline lag > 1 minute (capacity issue?) diff --git a/references/metrics_design.md b/references/metrics_design.md new file mode 100644 index 0000000..d915742 --- /dev/null +++ b/references/metrics_design.md @@ -0,0 +1,406 @@ +# Metrics Design Guide + +## The Four Golden Signals + +The Four Golden Signals from Google's SRE book provide a comprehensive view of system health: + +### 1. Latency +**What**: Time to service a request + +**Why Monitor**: Directly impacts user experience + +**Key Metrics**: +- Request duration (p50, p95, p99, p99.9) +- Time to first byte (TTFB) +- Backend processing time +- Database query latency + +**PromQL Examples**: +```promql +# P95 latency +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Average latency by endpoint +avg(rate(http_request_duration_seconds_sum[5m])) by (endpoint) + / +avg(rate(http_request_duration_seconds_count[5m])) by (endpoint) +``` + +**Alert Thresholds**: +- Warning: p95 > 500ms +- Critical: p99 > 2s + +### 2. Traffic +**What**: Demand on your system + +**Why Monitor**: Understand load patterns, capacity planning + +**Key Metrics**: +- Requests per second (RPS) +- Transactions per second (TPS) +- Concurrent connections +- Network throughput + +**PromQL Examples**: +```promql +# Requests per second +sum(rate(http_requests_total[5m])) + +# Requests per second by status code +sum(rate(http_requests_total[5m])) by (status) + +# Traffic growth rate (week over week) +sum(rate(http_requests_total[5m])) + / +sum(rate(http_requests_total[5m] offset 7d)) +``` + +**Alert Thresholds**: +- Warning: RPS > 80% of capacity +- Critical: RPS > 95% of capacity + +### 3. Errors +**What**: Rate of requests that fail + +**Why Monitor**: Direct indicator of user-facing problems + +**Key Metrics**: +- Error rate (%) +- 5xx response codes +- Failed transactions +- Exception counts + +**PromQL Examples**: +```promql +# Error rate percentage +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# Error count by type +sum(rate(http_requests_total{status=~"5.."}[5m])) by (status) + +# Application errors +rate(application_errors_total[5m]) +``` + +**Alert Thresholds**: +- Warning: Error rate > 1% +- Critical: Error rate > 5% + +### 4. 
Saturation +**What**: How "full" your service is + +**Why Monitor**: Predict capacity issues before they impact users + +**Key Metrics**: +- CPU utilization +- Memory utilization +- Disk I/O +- Network bandwidth +- Queue depth +- Thread pool usage + +**PromQL Examples**: +```promql +# CPU saturation +100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Memory saturation +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 + +# Disk saturation +rate(node_disk_io_time_seconds_total[5m]) * 100 + +# Queue depth +queue_depth_current / queue_depth_max * 100 +``` + +**Alert Thresholds**: +- Warning: > 70% utilization +- Critical: > 90% utilization + +--- + +## RED Method (for Services) + +**R**ate, **E**rrors, **D**uration - a simplified approach for request-driven services + +### Rate +Number of requests per second: +```promql +sum(rate(http_requests_total[5m])) +``` + +### Errors +Number of failed requests per second: +```promql +sum(rate(http_requests_total{status=~"5.."}[5m])) +``` + +### Duration +Time taken to process requests: +```promql +histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +**When to Use**: Microservices, APIs, web applications + +--- + +## USE Method (for Resources) + +**U**tilization, **S**aturation, **E**rrors - for infrastructure resources + +### Utilization +Percentage of time resource is busy: +```promql +# CPU utilization +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Disk utilization +(node_filesystem_size_bytes - node_filesystem_avail_bytes) + / node_filesystem_size_bytes * 100 +``` + +### Saturation +Amount of work the resource cannot service (queued): +```promql +# Load average (saturation indicator) +node_load15 + +# Disk I/O wait time +rate(node_disk_io_time_weighted_seconds_total[5m]) +``` + +### Errors +Count of error events: +```promql +# Network errors +rate(node_network_receive_errs_total[5m]) +rate(node_network_transmit_errs_total[5m]) + +# Disk errors +rate(node_disk_io_errors_total[5m]) +``` + +**When to Use**: Servers, databases, network devices + +--- + +## Metric Types + +### Counter +Monotonically increasing value (never decreases) + +**Examples**: Request count, error count, bytes sent + +**Usage**: +```promql +# Always use rate() or increase() with counters +rate(http_requests_total[5m]) # Requests per second +increase(http_requests_total[1h]) # Total requests in 1 hour +``` + +### Gauge +Value that can go up and down + +**Examples**: Memory usage, queue depth, concurrent connections + +**Usage**: +```promql +# Use directly or with aggregations +avg(memory_usage_bytes) +max(queue_depth) +``` + +### Histogram +Samples observations and counts them in configurable buckets + +**Examples**: Request duration, response size + +**Usage**: +```promql +# Calculate percentiles +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Average from histogram +rate(http_request_duration_seconds_sum[5m]) + / +rate(http_request_duration_seconds_count[5m]) +``` + +### Summary +Similar to histogram but calculates quantiles on client side + +**Usage**: Less flexible than histograms, avoid for new metrics + +--- + +## Cardinality Best Practices + +**Cardinality**: Number of unique time series + +### High Cardinality Labels (AVOID) +❌ User ID +❌ Email address +❌ IP address +❌ Timestamp +❌ Random IDs + +### Low Cardinality Labels (GOOD) +✅ Environment (prod, staging) +✅ Region (us-east-1, eu-west-1) +✅ Service name +✅ 
HTTP status code category (2xx, 4xx, 5xx) +✅ Endpoint/route + +### Calculating Cardinality Impact +``` +Time series = unique combinations of labels + +Example: +service (5) × environment (3) × region (4) × status (5) = 300 time series ✅ + +service (5) × environment (3) × region (4) × user_id (1M) = 60M time series ❌ +``` + +--- + +## Naming Conventions + +### Prometheus Naming +``` +___total + +Examples: +http_requests_total +http_request_duration_seconds +process_cpu_seconds_total +node_memory_MemAvailable_bytes +``` + +**Rules**: +- Use snake_case +- Include unit in name (seconds, bytes, ratio) +- Use `_total` suffix for counters +- Namespace by application/component + +### CloudWatch Naming +``` +/ + +Examples: +AWS/EC2/CPUUtilization +MyApp/RequestCount +``` + +**Rules**: +- Use PascalCase +- Group by namespace +- No unit in name (specified separately) + +--- + +## Dashboard Design + +### Key Principles + +1. **Top-Down Layout**: Most important metrics first +2. **Color Coding**: Red (critical), yellow (warning), green (healthy) +3. **Consistent Time Windows**: All panels use same time range +4. **Limit Panels**: 8-12 panels per dashboard maximum +5. **Include Context**: Show related metrics together + +### Dashboard Structure + +``` +┌─────────────────────────────────────────────┐ +│ Overall Health (Single Stats) │ +│ [Requests/s] [Error%] [P95 Latency] │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Request Rate & Errors (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Latency Distribution (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Resource Usage (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Dependencies (Graphs) │ +└─────────────────────────────────────────────┘ +``` + +### Template Variables +Use variables for filtering: +- Environment: `$environment` +- Service: `$service` +- Region: `$region` +- Pod: `$pod` + +--- + +## Common Pitfalls + +### 1. Monitoring What You Build, Not What Users Experience +❌ `backend_processing_complete` +✅ `user_request_completed` + +### 2. Too Many Metrics +- Start with Four Golden Signals +- Add metrics only when needed for specific issues +- Remove unused metrics + +### 3. Incorrect Aggregations +❌ `avg(rate(...))` - averages rates incorrectly +✅ `sum(rate(...)) / count(...)` - correct average + +### 4. Wrong Time Windows +- Too short (< 1m): Noisy data +- Too long (> 15m): Miss short-lived issues +- Sweet spot: 5m for most alerts + +### 5. 
Missing Labels +❌ `http_requests_total` +✅ `http_requests_total{method="GET", status="200", endpoint="/api/users"}` + +--- + +## Metric Collection Best Practices + +### Application Instrumentation +```python +from prometheus_client import Counter, Histogram, Gauge + +# Counter for requests +requests_total = Counter('http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status']) + +# Histogram for latency +request_duration = Histogram('http_request_duration_seconds', + 'HTTP request duration', + ['method', 'endpoint']) + +# Gauge for in-progress requests +requests_in_progress = Gauge('http_requests_in_progress', + 'HTTP requests currently being processed') +``` + +### Collection Intervals +- Application metrics: 15-30s +- Infrastructure metrics: 30-60s +- Billing/cost metrics: 5-15m +- External API checks: 1-5m + +### Retention +- Raw metrics: 15-30 days +- 5m aggregates: 90 days +- 1h aggregates: 1 year +- Daily aggregates: 2+ years diff --git a/references/slo_sla_guide.md b/references/slo_sla_guide.md new file mode 100644 index 0000000..3704fd9 --- /dev/null +++ b/references/slo_sla_guide.md @@ -0,0 +1,652 @@ +# SLI, SLO, and SLA Guide + +## Definitions + +### SLI (Service Level Indicator) +**What**: A quantitative measure of service quality + +**Examples**: +- Request latency (ms) +- Error rate (%) +- Availability (%) +- Throughput (requests/sec) + +### SLO (Service Level Objective) +**What**: Target value or range for an SLI + +**Examples**: +- "99.9% of requests return in < 500ms" +- "99.95% availability" +- "Error rate < 0.1%" + +### SLA (Service Level Agreement) +**What**: Business contract with consequences for SLO violations + +**Examples**: +- "99.9% uptime or 10% monthly credit" +- "p95 latency < 1s or refund" + +### Relationship +``` +SLI = Measurement +SLO = Target (internal goal) +SLA = Promise (customer contract with penalties) + +Example: +SLI: Actual availability this month = 99.92% +SLO: Target availability = 99.9% +SLA: Guaranteed availability = 99.5% (with penalties) +``` + +--- + +## Choosing SLIs + +### The Four Golden Signals as SLIs + +1. **Latency SLIs** + - Request duration (p50, p95, p99) + - Time to first byte + - Page load time + +2. **Availability/Success SLIs** + - % of successful requests + - % uptime + - % of requests completing + +3. **Throughput SLIs** (less common) + - Requests per second + - Transactions per second + +4. 
**Saturation SLIs** (internal only) + - Resource utilization + - Queue depth + +### SLI Selection Criteria + +✅ **Good SLIs**: +- Measured from user perspective +- Directly impact user experience +- Aggregatable across instances +- Proportional to user happiness + +❌ **Bad SLIs**: +- Internal metrics only +- Not user-facing +- Hard to measure consistently + +### Examples by Service Type + +**Web Application**: +``` +SLI 1: Request Success Rate + = successful_requests / total_requests + +SLI 2: Request Latency (p95) + = 95th percentile of response times + +SLI 3: Availability + = time_service_responding / total_time +``` + +**API Service**: +``` +SLI 1: Error Rate + = (4xx_errors + 5xx_errors) / total_requests + +SLI 2: Response Time (p99) + = 99th percentile latency + +SLI 3: Throughput + = requests_per_second +``` + +**Batch Processing**: +``` +SLI 1: Job Success Rate + = successful_jobs / total_jobs + +SLI 2: Processing Latency + = time_from_submission_to_completion + +SLI 3: Freshness + = age_of_oldest_unprocessed_item +``` + +**Storage Service**: +``` +SLI 1: Durability + = data_not_lost / total_data + +SLI 2: Read Latency (p99) + = 99th percentile read time + +SLI 3: Write Success Rate + = successful_writes / total_writes +``` + +--- + +## Setting SLO Targets + +### Start with Current Performance + +1. **Measure baseline**: Collect 30 days of data +2. **Analyze distribution**: Look at p50, p95, p99, p99.9 +3. **Set initial SLO**: Slightly better than worst performer +4. **Iterate**: Tighten or loosen based on feasibility + +### Example Process + +**Current Performance** (30 days): +``` +p50 latency: 120ms +p95 latency: 450ms +p99 latency: 1200ms +p99.9 latency: 3500ms + +Error rate: 0.05% +Availability: 99.95% +``` + +**Initial SLOs**: +``` +Latency: p95 < 500ms (slightly worse than current p95) +Error rate: < 0.1% (double current rate) +Availability: 99.9% (slightly worse than current) +``` + +**Rationale**: Start loose, prevent false alarms, tighten over time + +### Common SLO Targets + +**Availability**: +- **99%** (3.65 days downtime/year): Internal tools +- **99.5%** (1.83 days/year): Non-critical services +- **99.9%** (8.76 hours/year): Standard production +- **99.95%** (4.38 hours/year): Critical services +- **99.99%** (52 minutes/year): High availability +- **99.999%** (5 minutes/year): Mission critical + +**Latency**: +- **p50 < 100ms**: Excellent responsiveness +- **p95 < 500ms**: Standard web applications +- **p99 < 1s**: Acceptable for most users +- **p99.9 < 5s**: Acceptable for rare edge cases + +**Error Rate**: +- **< 0.01%** (99.99% success): Critical operations +- **< 0.1%** (99.9% success): Standard production +- **< 1%** (99% success): Non-critical services + +--- + +## Error Budgets + +### Concept + +Error budget = (100% - SLO target) + +If SLO is 99.9%, error budget is 0.1% + +**Purpose**: Balance reliability with feature velocity + +### Calculation + +**For availability**: +``` +Monthly error budget = (1 - SLO) × time_period + +Example (99.9% SLO, 30 days): +Error budget = 0.001 × 30 days = 0.03 days = 43.2 minutes +``` + +**For request-based SLIs**: +``` +Error budget = (1 - SLO) × total_requests + +Example (99.9% SLO, 10M requests/month): +Error budget = 0.001 × 10,000,000 = 10,000 failed requests +``` + +### Error Budget Consumption + +**Formula**: +``` +Budget consumed = actual_errors / allowed_errors × 100% + +Example: +SLO: 99.9% (0.1% error budget) +Total requests: 1,000,000 +Failed requests: 500 +Allowed failures: 1,000 + +Budget consumed = 500 / 1,000 × 
100% = 50% +Budget remaining = 50% +``` + +### Error Budget Policy + +**Example policy**: + +```markdown +## Error Budget Policy + +### If error budget > 50% +- Deploy frequently (multiple times per day) +- Take calculated risks +- Experiment with new features +- Acceptable to have some incidents + +### If error budget 20-50% +- Deploy normally (once per day) +- Increase testing +- Review recent changes +- Monitor closely + +### If error budget < 20% +- Freeze non-critical deploys +- Focus on reliability improvements +- Postmortem all incidents +- Reduce change velocity + +### If error budget exhausted (< 0%) +- Complete deploy freeze except rollbacks +- All hands on reliability +- Mandatory postmortems +- Executive escalation +``` + +--- + +## Error Budget Burn Rate + +### Concept + +Burn rate = rate of error budget consumption + +**Example**: +- Monthly budget: 43.2 minutes (99.9% SLO) +- If consuming at 2x rate: Budget exhausted in 15 days +- If consuming at 10x rate: Budget exhausted in 3 days + +### Burn Rate Calculation + +``` +Burn rate = (actual_error_rate / allowed_error_rate) + +Example: +SLO: 99.9% (0.1% allowed error rate) +Current error rate: 0.5% + +Burn rate = 0.5% / 0.1% = 5x +Time to exhaust = 30 days / 5 = 6 days +``` + +### Multi-Window Alerting + +Alert on burn rate across multiple time windows: + +**Fast burn** (1 hour window): +``` +Burn rate > 14.4x → Exhausts budget in 2 days +Alert after 2 minutes +Severity: Critical (page immediately) +``` + +**Moderate burn** (6 hour window): +``` +Burn rate > 6x → Exhausts budget in 5 days +Alert after 30 minutes +Severity: Warning (create ticket) +``` + +**Slow burn** (3 day window): +``` +Burn rate > 1x → Exhausts budget by end of month +Alert after 6 hours +Severity: Info (monitor) +``` + +### Implementation + +**Prometheus**: +```yaml +# Fast burn alert (1h window, 2m grace period) +- alert: ErrorBudgetFastBurn + expr: | + ( + sum(rate(http_requests_total{status=~"5.."}[1h])) + / + sum(rate(http_requests_total[1h])) + ) > (14.4 * 0.001) # 14.4x burn rate for 99.9% SLO + for: 2m + labels: + severity: critical + annotations: + summary: "Fast error budget burn detected" + description: "Error budget will be exhausted in 2 days at current rate" + +# Slow burn alert (6h window, 30m grace period) +- alert: ErrorBudgetSlowBurn + expr: | + ( + sum(rate(http_requests_total{status=~"5.."}[6h])) + / + sum(rate(http_requests_total[6h])) + ) > (6 * 0.001) # 6x burn rate for 99.9% SLO + for: 30m + labels: + severity: warning + annotations: + summary: "Elevated error budget burn detected" +``` + +--- + +## SLO Reporting + +### Dashboard Structure + +**Overall Health**: +``` +┌─────────────────────────────────────────┐ +│ SLO Compliance: 99.92% ✅ │ +│ Error Budget Remaining: 73% 🟢 │ +│ Burn Rate: 0.8x 🟢 │ +└─────────────────────────────────────────┘ +``` + +**SLI Performance**: +``` +Latency p95: 420ms (Target: 500ms) ✅ +Error Rate: 0.08% (Target: < 0.1%) ✅ +Availability: 99.95% (Target: > 99.9%) ✅ +``` + +**Error Budget Trend**: +``` +Graph showing: +- Error budget consumption over time +- Burn rate spikes +- Incidents marked +- Deploy events overlaid +``` + +### Monthly SLO Report + +**Template**: +```markdown +# SLO Report: October 2024 + +## Executive Summary +- ✅ All SLOs met this month +- 🟡 Latency SLO came close to violation (99.1% compliance) +- 3 incidents consumed 47% of error budget +- Error budget remaining: 53% + +## SLO Performance + +### Availability SLO: 99.9% +- Actual: 99.92% +- Status: ✅ Met +- Error budget consumed: 33% 
+- Downtime: 23 minutes (allowed: 43.2 minutes) + +### Latency SLO: p95 < 500ms +- Actual p95: 445ms +- Status: ✅ Met +- Compliance: 99.1% (target: 99%) +- 0.9% of requests exceeded threshold + +### Error Rate SLO: < 0.1% +- Actual: 0.05% +- Status: ✅ Met +- Error budget consumed: 50% + +## Incidents + +### Incident #1: Database Overload (Oct 5) +- Duration: 15 minutes +- Error budget consumed: 35% +- Root cause: Slow query after schema change +- Prevention: Added query review to deploy checklist + +### Incident #2: API Gateway Timeout (Oct 12) +- Duration: 5 minutes +- Error budget consumed: 10% +- Root cause: Configuration error in load balancer +- Prevention: Automated configuration validation + +### Incident #3: Upstream Service Degradation (Oct 20) +- Duration: 3 minutes +- Error budget consumed: 2% +- Root cause: Third-party API outage +- Prevention: Implemented circuit breaker + +## Recommendations +1. Investigate latency near-miss (Oct 15-17) +2. Add automated rollback for database changes +3. Increase circuit breaker thresholds for third-party APIs +4. Consider tightening availability SLO to 99.95% + +## Next Month's Focus +- Reduce p95 latency to 400ms +- Implement automated canary deployments +- Add synthetic monitoring for critical paths +``` + +--- + +## SLA Structure + +### Components + +**Service Description**: +``` +The API Service provides RESTful endpoints for user management, +authentication, and data retrieval. +``` + +**Covered Metrics**: +``` +- Availability: Service is reachable and returns valid responses +- Latency: Time from request to response +- Error Rate: Percentage of requests returning errors +``` + +**SLA Targets**: +``` +Service commits to: +1. 99.9% monthly uptime +2. p95 API response time < 1 second +3. Error rate < 0.5% +``` + +**Measurement**: +``` +Metrics calculated from server-side monitoring: +- Uptime: Successful health check probes / total probes +- Latency: Server-side request duration (p95) +- Errors: HTTP 5xx responses / total responses + +Calculated monthly (first of month for previous month). +``` + +**Exclusions**: +``` +SLA does not cover: +- Scheduled maintenance (with 7 days notice) +- Client-side network issues +- DDoS attacks or force majeure +- Beta/preview features +- Issues caused by customer misuse +``` + +**Service Credits**: +``` +Monthly Uptime | Service Credit +---------------- | -------------- +< 99.9% (SLA) | 10% +< 99.0% | 25% +< 95.0% | 50% +``` + +**Claiming Credits**: +``` +Customer must: +1. Report violation within 30 days +2. Provide ticket numbers for support requests +3. Credits applied to next month's invoice +4. Credits do not exceed monthly fee +``` + +### Example SLAs by Industry + +**E-commerce**: +``` +- 99.95% availability +- p95 page load < 2s +- p99 checkout < 5s +- Credits: 5% per 0.1% below target +``` + +**Financial Services**: +``` +- 99.99% availability +- p99 transaction < 500ms +- Zero data loss +- Penalties: $10,000 per hour of downtime +``` + +**Media/Content**: +``` +- 99.9% availability +- p95 video start < 3s +- No credit system (best effort latency) +``` + +--- + +## Best Practices + +### 1. SLOs Should Be User-Centric +❌ "Database queries < 100ms" +✅ "API response time p95 < 500ms" + +### 2. Start Loose, Tighten Over Time +- Begin with achievable targets +- Build reliability culture +- Gradually raise bar + +### 3. Fewer, Better SLOs +- 1-3 SLOs per service +- Focus on user impact +- Avoid SLO sprawl + +### 4. 
SLAs More Conservative Than SLOs +``` +Internal SLO: 99.95% +Customer SLA: 99.9% +Margin: 0.05% buffer +``` + +### 5. Make Error Budgets Actionable +- Define policies at different thresholds +- Empower teams to make tradeoffs +- Review in planning meetings + +### 6. Document Everything +- How SLIs are measured +- Why targets were chosen +- Who owns each SLO +- How to interpret metrics + +### 7. Review Regularly +- Monthly SLO reviews +- Quarterly SLO adjustments +- Annual SLA renegotiation + +--- + +## Common Pitfalls + +### 1. Too Many SLOs +❌ 20 different SLOs per service +✅ 2-3 critical SLOs + +### 2. Unrealistic Targets +❌ 99.999% for non-critical service +✅ 99.9% with room to improve + +### 3. SLOs Without Error Budgets +❌ "Must always be 99.9%" +✅ "Budget for 0.1% errors" + +### 4. No Consequences +❌ Missing SLO has no impact +✅ Deploy freeze when budget exhausted + +### 5. SLA Equals SLO +❌ Promise exactly what you target +✅ SLA more conservative than SLO + +### 6. Ignoring User Experience +❌ "Our servers are up 99.99%" +✅ "Users can complete actions 99.9% of the time" + +### 7. Static Targets +❌ Set once, never revisit +✅ Quarterly reviews and adjustments + +--- + +## Tools and Automation + +### SLO Tracking Tools + +**Prometheus + Grafana**: +- Use recording rules for SLIs +- Alert on burn rates +- Dashboard for compliance + +**Google Cloud SLO Monitoring**: +- Built-in SLO tracking +- Automatic error budget calculation +- Integration with alerting + +**Datadog SLOs**: +- UI for SLO definition +- Automatic burn rate alerts +- Status pages + +**Custom Tools**: +- sloth: Generate Prometheus rules from SLO definitions +- slo-libsonnet: Jsonnet library for SLO monitoring + +### Example: Prometheus Recording Rules + +```yaml +groups: + - name: sli_recording + interval: 30s + rules: + # SLI: Request success rate + - record: sli:request_success:ratio + expr: | + sum(rate(http_requests_total{status!~"5.."}[5m])) + / + sum(rate(http_requests_total[5m])) + + # SLI: Request latency (p95) + - record: sli:request_latency:p95 + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) + ) + + # Error budget burn rate (1h window) + - record: slo:error_budget_burn_rate:1h + expr: | + (1 - sli:request_success:ratio) / 0.001 +``` diff --git a/references/tool_comparison.md b/references/tool_comparison.md new file mode 100644 index 0000000..849b21a --- /dev/null +++ b/references/tool_comparison.md @@ -0,0 +1,697 @@ +# Monitoring Tools Comparison + +## Overview Matrix + +| Tool | Type | Best For | Complexity | Cost | Cloud/Self-Hosted | +|------|------|----------|------------|------|-------------------| +| **Prometheus** | Metrics | Kubernetes, time-series | Medium | Free | Self-hosted | +| **Grafana** | Visualization | Dashboards, multi-source | Low-Medium | Free | Both | +| **Datadog** | Full-stack | Ease of use, APM | Low | High | Cloud | +| **New Relic** | Full-stack | APM, traces | Low | High | Cloud | +| **Elasticsearch (ELK)** | Logs | Log search, analysis | High | Medium | Both | +| **Grafana Loki** | Logs | Cost-effective logs | Medium | Free | Both | +| **CloudWatch** | AWS-native | AWS infrastructure | Low | Medium | Cloud | +| **Jaeger** | Tracing | Distributed tracing | Medium | Free | Self-hosted | +| **Grafana Tempo** | Tracing | Cost-effective tracing | Medium | Free | Self-hosted | + +--- + +## Metrics Platforms + +### Prometheus + +**Type**: Open-source time-series database + +**Strengths**: +- ✅ Industry standard for Kubernetes +- ✅ Powerful 
query language (PromQL) +- ✅ Pull-based model (no agent config) +- ✅ Service discovery +- ✅ Free and open source +- ✅ Huge ecosystem (exporters for everything) + +**Weaknesses**: +- ❌ No built-in dashboards (need Grafana) +- ❌ Single-node only (no HA without federation) +- ❌ Limited long-term storage (need Thanos/Cortex) +- ❌ Steep learning curve for PromQL + +**Best For**: +- Kubernetes monitoring +- Infrastructure metrics +- Custom application metrics +- Organizations that need control + +**Pricing**: Free (open source) + +**Setup Complexity**: Medium + +**Example**: +```yaml +# prometheus.yml +scrape_configs: + - job_name: 'app' + static_configs: + - targets: ['localhost:8080'] +``` + +--- + +### Datadog + +**Type**: SaaS monitoring platform + +**Strengths**: +- ✅ Easy to set up (install agent, done) +- ✅ Beautiful pre-built dashboards +- ✅ APM, logs, metrics, traces in one platform +- ✅ Great anomaly detection +- ✅ Excellent integrations (500+) +- ✅ Good mobile app + +**Weaknesses**: +- ❌ Very expensive at scale +- ❌ Vendor lock-in +- ❌ Cost can be unpredictable (per-host pricing) +- ❌ Limited PromQL support + +**Best For**: +- Teams that want quick setup +- Companies prioritizing ease of use over cost +- Organizations needing full observability + +**Pricing**: $15-$31/host/month + custom metrics fees + +**Setup Complexity**: Low + +**Example**: +```bash +# Install agent +DD_API_KEY=xxx bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)" +``` + +--- + +### New Relic + +**Type**: SaaS application performance monitoring + +**Strengths**: +- ✅ Excellent APM capabilities +- ✅ User-friendly interface +- ✅ Good transaction tracing +- ✅ Comprehensive alerting +- ✅ Generous free tier + +**Weaknesses**: +- ❌ Can get expensive at scale +- ❌ Vendor lock-in +- ❌ Query language less powerful than PromQL +- ❌ Limited customization + +**Best For**: +- Application performance monitoring +- Teams focused on APM over infrastructure +- Startups (free tier is generous) + +**Pricing**: Free up to 100GB/month, then $0.30/GB + +**Setup Complexity**: Low + +**Example**: +```python +import newrelic.agent +newrelic.agent.initialize('newrelic.ini') +``` + +--- + +### CloudWatch + +**Type**: AWS-native monitoring + +**Strengths**: +- ✅ Zero setup for AWS services +- ✅ Native integration with AWS +- ✅ Automatic dashboards for AWS resources +- ✅ Tightly integrated with other AWS services +- ✅ Good for cost if already on AWS + +**Weaknesses**: +- ❌ AWS-only (not multi-cloud) +- ❌ Limited query capabilities +- ❌ High costs for custom metrics +- ❌ Basic visualization +- ❌ 1-minute minimum resolution + +**Best For**: +- AWS-centric infrastructure +- Quick setup for AWS services +- Organizations already invested in AWS + +**Pricing**: +- First 10 custom metrics: Free +- Additional: $0.30/metric/month +- API calls: $0.01/1000 requests + +**Setup Complexity**: Low (for AWS), Medium (for custom metrics) + +**Example**: +```python +import boto3 +cloudwatch = boto3.client('cloudwatch') +cloudwatch.put_metric_data( + Namespace='MyApp', + MetricData=[{'MetricName': 'RequestCount', 'Value': 1}] +) +``` + +--- + +### Grafana Cloud / Mimir + +**Type**: Managed Prometheus-compatible + +**Strengths**: +- ✅ Prometheus-compatible (PromQL) +- ✅ Managed service (no ops burden) +- ✅ Good cost model (pay for what you use) +- ✅ Grafana dashboards included +- ✅ Long-term storage + +**Weaknesses**: +- ❌ Relatively new (less mature) +- ❌ Some Prometheus features missing +- ❌ Requires Grafana for visualization + 
+**Best For**: +- Teams wanting Prometheus without ops overhead +- Multi-cloud environments +- Organizations already using Grafana + +**Pricing**: $8/month + $0.29/1M samples + +**Setup Complexity**: Low-Medium + +--- + +## Logging Platforms + +### Elasticsearch (ELK Stack) + +**Type**: Open-source log search and analytics + +**Full Stack**: Elasticsearch + Logstash + Kibana + +**Strengths**: +- ✅ Powerful search capabilities +- ✅ Rich query language +- ✅ Great for log analysis +- ✅ Mature ecosystem +- ✅ Can handle large volumes +- ✅ Flexible data model + +**Weaknesses**: +- ❌ Complex to operate +- ❌ Resource intensive (RAM hungry) +- ❌ Expensive at scale +- ❌ Requires dedicated ops team +- ❌ Slow for high-cardinality queries + +**Best For**: +- Large organizations with ops teams +- Deep log analysis needs +- Search-heavy use cases + +**Pricing**: Free (open source) + infrastructure costs + +**Infrastructure**: ~$500-2000/month for medium scale + +**Setup Complexity**: High + +**Example**: +```json +PUT /logs-2024.10/_doc/1 +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "error", + "message": "Payment failed" +} +``` + +--- + +### Grafana Loki + +**Type**: Log aggregation system + +**Strengths**: +- ✅ Cost-effective (labels only, not full-text indexing) +- ✅ Easy to operate +- ✅ Prometheus-like label model +- ✅ Great Grafana integration +- ✅ Low resource usage +- ✅ Fast time-range queries + +**Weaknesses**: +- ❌ Limited full-text search +- ❌ Requires careful label design +- ❌ Younger ecosystem than ELK +- ❌ Not ideal for complex queries + +**Best For**: +- Cost-conscious organizations +- Kubernetes environments +- Teams already using Prometheus +- Time-series log queries + +**Pricing**: Free (open source) + infrastructure costs + +**Infrastructure**: ~$100-500/month for medium scale + +**Setup Complexity**: Medium + +**Example**: +```logql +{job="api", environment="prod"} |= "error" | json | level="error" +``` + +--- + +### Splunk + +**Type**: Enterprise log management + +**Strengths**: +- ✅ Extremely powerful search +- ✅ Great for security/compliance +- ✅ Mature platform +- ✅ Enterprise support +- ✅ Machine learning features + +**Weaknesses**: +- ❌ Very expensive +- ❌ Complex pricing (per GB ingested) +- ❌ Steep learning curve +- ❌ Heavy resource usage + +**Best For**: +- Large enterprises +- Security operations centers (SOCs) +- Compliance-heavy industries + +**Pricing**: $150-$1800/GB/month (depending on tier) + +**Setup Complexity**: Medium-High + +--- + +### CloudWatch Logs + +**Type**: AWS-native log management + +**Strengths**: +- ✅ Zero setup for AWS services +- ✅ Integrated with AWS ecosystem +- ✅ CloudWatch Insights for queries +- ✅ Reasonable cost for low volume + +**Weaknesses**: +- ❌ AWS-only +- ❌ Limited query capabilities +- ❌ Expensive at high volume +- ❌ Basic visualization + +**Best For**: +- AWS-centric applications +- Low-volume logging +- Simple log aggregation + +**Pricing**: Tiered (as of May 2025) +- Vended Logs: $0.50/GB (first 10TB), $0.25/GB (next 20TB), then lower tiers +- Standard logs: $0.50/GB flat +- Storage: $0.03/GB + +**Setup Complexity**: Low (AWS), Medium (custom) + +--- + +### Sumo Logic + +**Type**: SaaS log management + +**Strengths**: +- ✅ Easy to use +- ✅ Good for cloud-native apps +- ✅ Real-time analytics +- ✅ Good compliance features + +**Weaknesses**: +- ❌ Expensive at scale +- ❌ Vendor lock-in +- ❌ Limited customization + +**Best For**: +- Cloud-native applications +- Teams wanting managed solution +- Security and compliance use cases + 
+**Pricing**: $90-$180/GB/month + +**Setup Complexity**: Low + +--- + +## Tracing Platforms + +### Jaeger + +**Type**: Open-source distributed tracing + +**Strengths**: +- ✅ Industry standard +- ✅ CNCF graduated project +- ✅ Supports OpenTelemetry +- ✅ Good UI +- ✅ Free and open source + +**Weaknesses**: +- ❌ Requires separate storage backend +- ❌ Limited query capabilities +- ❌ No built-in analytics + +**Best For**: +- Microservices architectures +- Kubernetes environments +- OpenTelemetry users + +**Pricing**: Free (open source) + storage costs + +**Setup Complexity**: Medium + +--- + +### Grafana Tempo + +**Type**: Open-source distributed tracing + +**Strengths**: +- ✅ Cost-effective (object storage) +- ✅ Easy to operate +- ✅ Great Grafana integration +- ✅ TraceQL query language +- ✅ Supports OpenTelemetry + +**Weaknesses**: +- ❌ Younger than Jaeger +- ❌ Limited third-party integrations +- ❌ Requires Grafana for UI + +**Best For**: +- Cost-conscious organizations +- Teams using Grafana stack +- High trace volumes + +**Pricing**: Free (open source) + storage costs + +**Setup Complexity**: Medium + +--- + +### Datadog APM + +**Type**: SaaS application performance monitoring + +**Strengths**: +- ✅ Easy to set up +- ✅ Excellent trace visualization +- ✅ Integrated with metrics/logs +- ✅ Automatic service map +- ✅ Good profiling features + +**Weaknesses**: +- ❌ Expensive ($31/host/month) +- ❌ Vendor lock-in +- ❌ Limited sampling control + +**Best For**: +- Teams wanting ease of use +- Organizations already using Datadog +- Complex microservices + +**Pricing**: $31/host/month + $1.70/million spans + +**Setup Complexity**: Low + +--- + +### AWS X-Ray + +**Type**: AWS-native distributed tracing + +**Strengths**: +- ✅ Native AWS integration +- ✅ Automatic instrumentation for AWS services +- ✅ Low cost + +**Weaknesses**: +- ❌ AWS-only +- ❌ Basic UI +- ❌ Limited query capabilities + +**Best For**: +- AWS-centric applications +- Serverless architectures (Lambda) +- Cost-sensitive projects + +**Pricing**: $5/million traces, first 100k free/month + +**Setup Complexity**: Low (AWS), Medium (custom) + +--- + +## Full-Stack Observability + +### Datadog (Full Platform) + +**Components**: Metrics, logs, traces, RUM, synthetics + +**Strengths**: +- ✅ Everything in one platform +- ✅ Excellent user experience +- ✅ Correlation across signals +- ✅ Great for teams + +**Weaknesses**: +- ❌ Very expensive ($50-100+/host/month) +- ❌ Vendor lock-in +- ❌ Unpredictable costs + +**Total Cost** (example 100 hosts): +- Infrastructure: $3,100/month +- APM: $3,100/month +- Logs: ~$2,000/month +- **Total: ~$8,000/month** + +--- + +### Grafana Stack (LGTM) + +**Components**: Loki (logs), Grafana (viz), Tempo (traces), Mimir/Prometheus (metrics) + +**Strengths**: +- ✅ Open source and cost-effective +- ✅ Unified visualization +- ✅ Prometheus-compatible +- ✅ Great for cloud-native + +**Weaknesses**: +- ❌ Requires self-hosting or Grafana Cloud +- ❌ More ops burden +- ❌ Less polished than commercial tools + +**Total Cost** (self-hosted, 100 hosts): +- Infrastructure: ~$1,500/month +- Ops time: Variable +- **Total: ~$1,500-3,000/month** + +--- + +### Elastic Observability + +**Components**: Elasticsearch (logs), Kibana (viz), APM, metrics + +**Strengths**: +- ✅ Powerful search +- ✅ Mature platform +- ✅ Good for log-heavy use cases + +**Weaknesses**: +- ❌ Complex to operate +- ❌ Expensive infrastructure +- ❌ Resource intensive + +**Total Cost** (self-hosted, 100 hosts): +- Infrastructure: ~$3,000-5,000/month +- Ops time: High +- 
**Total: ~$4,000-7,000/month** + +--- + +### New Relic One + +**Components**: Metrics, logs, traces, synthetics + +**Strengths**: +- ✅ Generous free tier (100GB) +- ✅ User-friendly +- ✅ Good for startups + +**Weaknesses**: +- ❌ Costs increase quickly after free tier +- ❌ Vendor lock-in + +**Total Cost**: +- Free: up to 100GB/month +- Paid: $0.30/GB beyond 100GB + +--- + +## Cloud Provider Native + +### AWS (CloudWatch + X-Ray) + +**Use When**: +- Primarily on AWS +- Simple monitoring needs +- Want minimal setup + +**Avoid When**: +- Multi-cloud environment +- Need advanced features +- High log volume (expensive) + +**Cost** (example): +- 100 EC2 instances with basic metrics: ~$150/month +- 1TB logs: ~$500/month ingestion + storage +- X-Ray: ~$50/month + +--- + +### GCP (Cloud Monitoring + Cloud Trace) + +**Use When**: +- Primarily on GCP +- Using GKE +- Want tight GCP integration + +**Avoid When**: +- Multi-cloud environment +- Need advanced querying + +**Cost** (example): +- First 150MB/month per resource: Free +- Additional: $0.2508/MB + +--- + +### Azure (Azure Monitor) + +**Use When**: +- Primarily on Azure +- Using AKS +- Need Azure integration + +**Avoid When**: +- Multi-cloud +- Need advanced features + +**Cost** (example): +- First 5GB: Free +- Additional: $2.76/GB + +--- + +## Decision Matrix + +### Choose Prometheus + Grafana If: +- ✅ Using Kubernetes +- ✅ Want control and customization +- ✅ Have ops capacity +- ✅ Budget-conscious +- ✅ Need Prometheus ecosystem + +### Choose Datadog If: +- ✅ Want ease of use +- ✅ Need full observability now +- ✅ Budget allows ($8k+/month for 100 hosts) +- ✅ Limited ops team +- ✅ Need excellent UX + +### Choose ELK If: +- ✅ Heavy log analysis needs +- ✅ Need powerful search +- ✅ Have dedicated ops team +- ✅ Compliance requirements +- ✅ Willing to invest in infrastructure + +### Choose Grafana Stack (LGTM) If: +- ✅ Want open source full stack +- ✅ Cost-effective solution +- ✅ Cloud-native architecture +- ✅ Already using Prometheus +- ✅ Have some ops capacity + +### Choose New Relic If: +- ✅ Startup with free tier +- ✅ APM is priority +- ✅ Want easy setup +- ✅ Don't need heavy customization + +### Choose Cloud Native (CloudWatch/etc) If: +- ✅ Single cloud provider +- ✅ Simple needs +- ✅ Want minimal setup +- ✅ Low to medium scale + +--- + +## Cost Comparison + +**Example: 100 hosts, 1TB logs/month, 1M spans/day** + +| Solution | Monthly Cost | Setup | Ops Burden | +|----------|-------------|--------|------------| +| **Prometheus + Loki + Tempo** | $1,500 | Medium | Medium | +| **Grafana Cloud** | $3,000 | Low | Low | +| **Datadog** | $8,000 | Low | None | +| **New Relic** | $3,500 | Low | None | +| **ELK Stack** | $4,000 | High | High | +| **CloudWatch** | $2,000 | Low | Low | + +--- + +## Recommendations by Company Size + +### Startup (< 10 engineers) +**Recommendation**: New Relic or Grafana Cloud +- Minimal ops burden +- Good free tiers +- Easy to get started + +### Small Company (10-50 engineers) +**Recommendation**: Prometheus + Grafana + Loki (self-hosted or cloud) +- Cost-effective +- Growing ops capacity +- Flexibility + +### Medium Company (50-200 engineers) +**Recommendation**: Datadog or Grafana Stack +- Datadog if budget allows +- Grafana Stack if cost-conscious + +### Large Enterprise (200+ engineers) +**Recommendation**: Build observability platform +- Mix of tools based on needs +- Dedicated observability team +- Custom integrations diff --git a/references/tracing_guide.md b/references/tracing_guide.md new file mode 100644 index 
0000000..7e72fcb --- /dev/null +++ b/references/tracing_guide.md @@ -0,0 +1,663 @@ +# Distributed Tracing Guide + +## What is Distributed Tracing? + +Distributed tracing tracks a request as it flows through multiple services in a distributed system. + +### Key Concepts + +**Trace**: End-to-end journey of a request +**Span**: Single operation within a trace +**Context**: Metadata propagated between services (trace_id, span_id) + +### Example Flow +``` +User Request → API Gateway → Auth Service → User Service → Database + ↓ ↓ ↓ + [Trace ID: abc123] + Span 1: gateway (50ms) + Span 2: auth (20ms) + Span 3: user_service (100ms) + Span 4: db_query (80ms) + +Total: 250ms with waterfall view showing dependencies +``` + +--- + +## OpenTelemetry (OTel) + +OpenTelemetry is the industry standard for instrumentation. + +### Components + +**API**: Instrument code (create spans, add attributes) +**SDK**: Implement API, configure exporters +**Collector**: Receive, process, and export telemetry data +**Exporters**: Send data to backends (Jaeger, Tempo, Zipkin) + +### Architecture +``` +Application → OTel SDK → OTel Collector → Backend (Jaeger/Tempo) + ↓ + Visualization +``` + +--- + +## Instrumentation Examples + +### Python (using OpenTelemetry) + +**Setup**: +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +# Setup tracer +trace.set_tracer_provider(TracerProvider()) +tracer = trace.get_tracer(__name__) + +# Configure exporter +otlp_exporter = OTLPSpanExporter(endpoint="localhost:4317") +span_processor = BatchSpanProcessor(otlp_exporter) +trace.get_tracer_provider().add_span_processor(span_processor) +``` + +**Manual instrumentation**: +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +@tracer.start_as_current_span("process_order") +def process_order(order_id): + span = trace.get_current_span() + span.set_attribute("order.id", order_id) + span.set_attribute("order.amount", 99.99) + + try: + result = payment_service.charge(order_id) + span.set_attribute("payment.status", "success") + return result + except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +**Auto-instrumentation** (Flask example): +```python +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor + +# Auto-instrument Flask +FlaskInstrumentor().instrument_app(app) + +# Auto-instrument requests library +RequestsInstrumentor().instrument() + +# Auto-instrument SQLAlchemy +SQLAlchemyInstrumentor().instrument(engine=db.engine) +``` + +### Node.js (using OpenTelemetry) + +**Setup**: +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); + +// Setup provider +const provider = new NodeTracerProvider(); +const exporter = new OTLPTraceExporter({ url: 'localhost:4317' }); +provider.addSpanProcessor(new BatchSpanProcessor(exporter)); +provider.register(); +``` + +**Manual instrumentation**: +```javascript +const tracer = provider.getTracer('my-service'); + +async function processOrder(orderId) { + const span = 
tracer.startSpan('process_order'); + span.setAttribute('order.id', orderId); + + try { + const result = await paymentService.charge(orderId); + span.setAttribute('payment.status', 'success'); + return result; + } catch (error) { + span.setStatus({ code: SpanStatusCode.ERROR }); + span.recordException(error); + throw error; + } finally { + span.end(); + } +} +``` + +**Auto-instrumentation**: +```javascript +const { registerInstrumentations } = require('@opentelemetry/instrumentation'); +const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http'); +const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express'); +const { MongoDBInstrumentation } = require('@opentelemetry/instrumentation-mongodb'); + +registerInstrumentations({ + instrumentations: [ + new HttpInstrumentation(), + new ExpressInstrumentation(), + new MongoDBInstrumentation() + ] +}); +``` + +### Go (using OpenTelemetry) + +**Setup**: +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/trace" +) + +func initTracer() { + exporter, _ := otlptracegrpc.New(context.Background()) + tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), + ) + otel.SetTracerProvider(tp) +} +``` + +**Manual instrumentation**: +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" +) + +func processOrder(ctx context.Context, orderID string) error { + tracer := otel.Tracer("my-service") + ctx, span := tracer.Start(ctx, "process_order") + defer span.End() + + span.SetAttributes( + attribute.String("order.id", orderID), + attribute.Float64("order.amount", 99.99), + ) + + err := paymentService.Charge(ctx, orderID) + if err != nil { + span.RecordError(err) + return err + } + + span.SetAttributes(attribute.String("payment.status", "success")) + return nil +} +``` + +--- + +## Span Attributes + +### Semantic Conventions + +Follow OpenTelemetry semantic conventions for consistency: + +**HTTP**: +```python +span.set_attribute("http.method", "GET") +span.set_attribute("http.url", "https://api.example.com/users") +span.set_attribute("http.status_code", 200) +span.set_attribute("http.user_agent", "Mozilla/5.0...") +``` + +**Database**: +```python +span.set_attribute("db.system", "postgresql") +span.set_attribute("db.name", "users_db") +span.set_attribute("db.statement", "SELECT * FROM users WHERE id = ?") +span.set_attribute("db.operation", "SELECT") +``` + +**RPC/gRPC**: +```python +span.set_attribute("rpc.system", "grpc") +span.set_attribute("rpc.service", "UserService") +span.set_attribute("rpc.method", "GetUser") +span.set_attribute("rpc.grpc.status_code", 0) +``` + +**Messaging**: +```python +span.set_attribute("messaging.system", "kafka") +span.set_attribute("messaging.destination", "user-events") +span.set_attribute("messaging.operation", "publish") +span.set_attribute("messaging.message_id", "msg123") +``` + +### Custom Attributes + +Add business context: +```python +span.set_attribute("user.id", "user123") +span.set_attribute("order.id", "ORD-456") +span.set_attribute("feature.flag.checkout_v2", True) +span.set_attribute("cache.hit", False) +``` + +--- + +## Context Propagation + +### W3C Trace Context (Standard) + +Headers propagated between services: +``` +traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01 +tracestate: vendor1=value1,vendor2=value2 +``` + +**Format**: `version-trace_id-parent_span_id-trace_flags` + +### Implementation + +**Python**: +```python +from 
opentelemetry.propagate import inject, extract +import requests + +# Inject context into outgoing request +headers = {} +inject(headers) +requests.get("https://api.example.com", headers=headers) + +# Extract context from incoming request +from flask import request +ctx = extract(request.headers) +``` + +**Node.js**: +```javascript +const { propagation } = require('@opentelemetry/api'); + +// Inject +const headers = {}; +propagation.inject(context.active(), headers); +axios.get('https://api.example.com', { headers }); + +// Extract +const ctx = propagation.extract(context.active(), req.headers); +``` + +**HTTP Example**: +```bash +curl -H "traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01" \ + https://api.example.com/users +``` + +--- + +## Sampling Strategies + +### 1. Always On/Off +```python +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ALWAYS_OFF + +# Development: trace everything +provider = TracerProvider(sampler=ALWAYS_ON) + +# Production: trace nothing (usually not desired) +provider = TracerProvider(sampler=ALWAYS_OFF) +``` + +### 2. Probability-Based +```python +from opentelemetry.sdk.trace.sampling import TraceIdRatioBased + +# Sample 10% of traces +provider = TracerProvider(sampler=TraceIdRatioBased(0.1)) +``` + +### 3. Rate Limiting +```python +from opentelemetry.sdk.trace.sampling import ParentBased, RateLimitingSampler + +# Sample max 100 traces per second +sampler = ParentBased(root=RateLimitingSampler(100)) +provider = TracerProvider(sampler=sampler) +``` + +### 4. Parent-Based (Default) +```python +from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased + +# If parent span is sampled, sample child spans +sampler = ParentBased(root=TraceIdRatioBased(0.1)) +provider = TracerProvider(sampler=sampler) +``` + +### 5. 
Custom Sampling +```python +from opentelemetry.sdk.trace.sampling import Sampler, Decision + +class ErrorSampler(Sampler): + """Always sample errors, sample 1% of successes""" + + def should_sample(self, parent_context, trace_id, name, **kwargs): + attributes = kwargs.get('attributes', {}) + + # Always sample if error + if attributes.get('error', False): + return Decision.RECORD_AND_SAMPLE + + # Sample 1% of successes + if trace_id & 0xFF < 3: # ~1% + return Decision.RECORD_AND_SAMPLE + + return Decision.DROP + +provider = TracerProvider(sampler=ErrorSampler()) +``` + +--- + +## Backends + +### Jaeger + +**Docker Compose**: +```yaml +version: '3' +services: + jaeger: + image: jaegertracing/all-in-one:latest + ports: + - "16686:16686" # UI + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + environment: + - COLLECTOR_OTLP_ENABLED=true +``` + +**Query traces**: +```bash +# UI: http://localhost:16686 + +# API: Get trace by ID +curl http://localhost:16686/api/traces/abc123 + +# Search traces +curl "http://localhost:16686/api/traces?service=my-service&limit=20" +``` + +### Grafana Tempo + +**Docker Compose**: +```yaml +version: '3' +services: + tempo: + image: grafana/tempo:latest + ports: + - "3200:3200" # Tempo + - "4317:4317" # OTLP gRPC + volumes: + - ./tempo.yaml:/etc/tempo.yaml + command: ["-config.file=/etc/tempo.yaml"] +``` + +**tempo.yaml**: +```yaml +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +storage: + trace: + backend: local + local: + path: /tmp/tempo/traces +``` + +**Query in Grafana**: +- Install Tempo data source +- Use TraceQL: `{ span.http.status_code = 500 }` + +### AWS X-Ray + +**Configuration**: +```python +from aws_xray_sdk.core import xray_recorder +from aws_xray_sdk.ext.flask.middleware import XRayMiddleware + +xray_recorder.configure(service='my-service') +XRayMiddleware(app, xray_recorder) +``` + +**Query**: +```bash +aws xray get-trace-summaries \ + --start-time 2024-10-28T00:00:00 \ + --end-time 2024-10-28T23:59:59 \ + --filter-expression 'error = true' +``` + +--- + +## Analysis Patterns + +### Find Slow Traces +``` +# Jaeger UI +- Filter by service +- Set min duration: 1000ms +- Sort by duration + +# TraceQL (Tempo) +{ duration > 1s } +``` + +### Find Error Traces +``` +# Jaeger UI +- Filter by tag: error=true +- Or by HTTP status: http.status_code=500 + +# TraceQL (Tempo) +{ span.http.status_code >= 500 } +``` + +### Find Traces by User +``` +# Jaeger UI +- Filter by tag: user.id=user123 + +# TraceQL (Tempo) +{ span.user.id = "user123" } +``` + +### Find N+1 Query Problems +Look for: +- Many sequential database spans +- Same query repeated multiple times +- Pattern: API call → DB query → DB query → DB query... 
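
Spotting this by eye across many traces gets tedious, so it can help to script the check. The sketch below is a rough illustration rather than standard tooling: it pulls recent traces from the Jaeger query API used in the curl examples above and flags traces where the same `db.statement` value repeats many times, the usual N+1 signature. The endpoint, service name, and threshold are placeholders, and the `db.statement` tag is assumed to follow the semantic conventions listed earlier.

```python
# Rough sketch: flag potential N+1 patterns in recent Jaeger traces.
# Assumptions: Jaeger all-in-one on localhost:16686 (as in the curl examples
# above) and spans tagged with "db.statement" per OTel semantic conventions.
from collections import Counter

import requests

JAEGER_URL = "http://localhost:16686"   # placeholder endpoint
SERVICE = "my-service"                  # placeholder service name
REPEAT_THRESHOLD = 10                   # identical statements per trace

resp = requests.get(
    f"{JAEGER_URL}/api/traces",
    params={"service": SERVICE, "limit": 50},
    timeout=30,
)
resp.raise_for_status()

for trace in resp.json().get("data", []):
    statement_counts = Counter()
    for span in trace.get("spans", []):
        tags = {t["key"]: t["value"] for t in span.get("tags", [])}
        if "db.statement" in tags:
            statement_counts[str(tags["db.statement"])] += 1

    for stmt, count in statement_counts.items():
        if count >= REPEAT_THRESHOLD:
            print(f"Possible N+1 in trace {trace['traceID']}: {count}x {stmt[:80]}")
```

If sampling is aggressive, few DB-heavy traces may be captured; running this against staging or temporarily raising the sampling rate makes the pattern easier to see.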
+ +### Find Service Bottlenecks +- Identify spans with longest duration +- Check if time is spent in service logic or waiting for dependencies +- Look at span relationships (parallel vs sequential) + +--- + +## Integration with Logs + +### Trace ID in Logs + +**Python**: +```python +from opentelemetry import trace + +def add_trace_context(): + span = trace.get_current_span() + trace_id = span.get_span_context().trace_id + span_id = span.get_span_context().span_id + + return { + "trace_id": format(trace_id, '032x'), + "span_id": format(span_id, '016x') + } + +logger.info("Processing order", **add_trace_context(), order_id=order_id) +``` + +**Query logs for trace**: +``` +# Elasticsearch +GET /logs/_search +{ + "query": { + "match": { "trace_id": "0af7651916cd43dd8448eb211c80319c" } + } +} + +# Loki (LogQL) +{job="app"} |= "0af7651916cd43dd8448eb211c80319c" +``` + +### Trace from Log (Grafana) + +Configure derived fields in Grafana: +```yaml +datasources: + - name: Loki + type: loki + jsonData: + derivedFields: + - name: TraceID + matcherRegex: "trace_id=([\\w]+)" + url: "http://tempo:3200/trace/$${__value.raw}" + datasourceUid: tempo_uid +``` + +--- + +## Best Practices + +### 1. Span Naming +✅ Use operation names, not IDs +- Good: `GET /api/users`, `UserService.GetUser`, `db.query.users` +- Bad: `/api/users/123`, `span_abc`, `query_1` + +### 2. Span Granularity +✅ One span per logical operation +- Too coarse: One span for entire request +- Too fine: Span for every variable assignment +- Just right: Span per service call, database query, external API + +### 3. Add Context +Always include: +- Operation name +- Service name +- Error status +- Business identifiers (user_id, order_id) + +### 4. Handle Errors +```python +try: + result = operation() +except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +### 5. Sampling Strategy +- Development: 100% +- Staging: 50-100% +- Production: 1-10% (or error-based) + +### 6. Performance Impact +- Overhead: ~1-5% CPU +- Use async exporters +- Batch span exports +- Sample appropriately + +### 7. 
Cardinality +Avoid high-cardinality attributes: +- ❌ Email addresses +- ❌ Full URLs with unique IDs +- ❌ Timestamps +- ✅ User ID +- ✅ Endpoint pattern +- ✅ Status code + +--- + +## Common Issues + +### Missing Traces +**Cause**: Context not propagated +**Solution**: Verify headers are injected/extracted + +### Incomplete Traces +**Cause**: Spans not closed properly +**Solution**: Always use `defer span.End()` or context managers + +### High Overhead +**Cause**: Too many spans or synchronous export +**Solution**: Reduce span count, use batch processor + +### No Error Traces +**Cause**: Errors not recorded on spans +**Solution**: Call `span.record_exception()` and set error status + +--- + +## Metrics from Traces + +Generate RED metrics from trace data: + +**Rate**: Traces per second +**Errors**: Traces with error status +**Duration**: Span duration percentiles + +**Example** (using Tempo + Prometheus): +```yaml +# Generate metrics from spans +metrics_generator: + processor: + span_metrics: + dimensions: + - http.method + - http.status_code +``` + +**Query**: +```promql +# Request rate +rate(traces_spanmetrics_calls_total[5m]) + +# Error rate +rate(traces_spanmetrics_calls_total{status_code="STATUS_CODE_ERROR"}[5m]) + / +rate(traces_spanmetrics_calls_total[5m]) + +# P95 latency +histogram_quantile(0.95, traces_spanmetrics_latency_bucket) +``` diff --git a/scripts/alert_quality_checker.py b/scripts/alert_quality_checker.py new file mode 100644 index 0000000..d926bb5 --- /dev/null +++ b/scripts/alert_quality_checker.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +""" +Audit Prometheus alert rules against best practices. +Checks for: alert naming, severity labels, runbook links, expression quality. +""" + +import argparse +import sys +import os +import re +from typing import Dict, List, Any +from pathlib import Path + +try: + import yaml +except ImportError: + print("⚠️ Warning: 'PyYAML' library not found. 
Install with: pip install pyyaml") + sys.exit(1) + + +class AlertQualityChecker: + def __init__(self): + self.issues = [] + self.warnings = [] + self.recommendations = [] + + def check_alert_name(self, alert_name: str) -> List[str]: + """Check alert naming conventions.""" + issues = [] + + # Should be PascalCase or camelCase + if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name): + issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)") + + # Should be descriptive + if len(alert_name) < 5: + issues.append(f"Alert name '{alert_name}' is too short, use descriptive names") + + # Avoid generic names + generic_names = ['Alert', 'Test', 'Warning', 'Error'] + if alert_name in generic_names: + issues.append(f"Alert name '{alert_name}' is too generic") + + return issues + + def check_labels(self, alert: Dict[str, Any]) -> List[str]: + """Check required and recommended labels.""" + issues = [] + labels = alert.get('labels', {}) + + # Required labels + if 'severity' not in labels: + issues.append("Missing required 'severity' label (critical/warning/info)") + elif labels['severity'] not in ['critical', 'warning', 'info']: + issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info") + + # Recommended labels + if 'team' not in labels: + self.recommendations.append("Consider adding 'team' label for routing") + + if 'component' not in labels and 'service' not in labels: + self.recommendations.append("Consider adding 'component' or 'service' label") + + return issues + + def check_annotations(self, alert: Dict[str, Any]) -> List[str]: + """Check annotations quality.""" + issues = [] + annotations = alert.get('annotations', {}) + + # Required annotations + if 'summary' not in annotations: + issues.append("Missing 'summary' annotation") + elif len(annotations['summary']) < 10: + issues.append("Summary annotation is too short, provide clear description") + + if 'description' not in annotations: + issues.append("Missing 'description' annotation") + + # Runbook + if 'runbook_url' not in annotations and 'runbook' not in annotations: + self.recommendations.append("Consider adding 'runbook_url' for incident response") + + # Check for templating + if 'summary' in annotations: + if '{{ $value }}' not in annotations['summary'] and '{{' not in annotations['summary']: + self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})") + + return issues + + def check_expression(self, expr: str, alert_name: str) -> List[str]: + """Check PromQL expression quality.""" + issues = [] + + # Should have a threshold + if '>' not in expr and '<' not in expr and '==' not in expr and '!=' not in expr: + issues.append("Expression should include a comparison operator") + + # Should use rate() for counters + if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr: + self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)") + + # Avoid instant queries without aggregation + if not any(agg in expr for agg in ['sum(', 'avg(', 'min(', 'max(', 'count(']): + if expr.count('{') > 1: # Multiple metrics without aggregation + self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.") + + # Check for proper time windows + if '[' not in expr and 'rate(' in expr: + issues.append("rate() requires a time window (e.g., rate(metric[5m]))") + + return issues + + def check_for_duration(self, rule: Dict[str, Any]) -> List[str]: + """Check for 'for' clause to prevent flapping.""" + 
issues = [] + severity = rule.get('labels', {}).get('severity', 'unknown') + + if 'for' not in rule: + if severity == 'critical': + issues.append("Critical alerts should have 'for' clause to prevent flapping") + else: + self.warnings.append("Consider adding 'for' clause to prevent alert flapping") + else: + # Parse duration + duration = rule['for'] + if severity == 'critical' and any(x in duration for x in ['0s', '30s', '1m']): + self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts") + + return issues + + def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]: + """Check a single alert rule.""" + alert_name = rule.get('alert', 'Unknown') + issues = [] + + # Check alert name + issues.extend(self.check_alert_name(alert_name)) + + # Check expression + if 'expr' not in rule: + issues.append("Missing 'expr' field") + else: + issues.extend(self.check_expression(rule['expr'], alert_name)) + + # Check labels + issues.extend(self.check_labels(rule)) + + # Check annotations + issues.extend(self.check_annotations(rule)) + + # Check for duration + issues.extend(self.check_for_duration(rule)) + + return { + "alert": alert_name, + "issues": issues, + "severity": rule.get('labels', {}).get('severity', 'unknown') + } + + def analyze_file(self, filepath: str) -> Dict[str, Any]: + """Analyze a Prometheus rules file.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + if not data: + return {"error": "Empty or invalid YAML file"} + + results = [] + groups = data.get('groups', []) + + for group in groups: + group_name = group.get('name', 'Unknown') + rules = group.get('rules', []) + + for rule in rules: + # Only check alerting rules, not recording rules + if 'alert' in rule: + result = self.check_alert_rule(rule) + result['group'] = group_name + results.append(result) + + return { + "file": filepath, + "groups": len(groups), + "alerts_checked": len(results), + "results": results + } + + except Exception as e: + return {"error": f"Failed to parse file: {e}"} + + +def print_results(analysis: Dict[str, Any], checker: AlertQualityChecker): + """Pretty print analysis results.""" + print("\n" + "="*60) + print("🚨 ALERT QUALITY CHECK RESULTS") + print("="*60) + + if "error" in analysis: + print(f"\n❌ Error: {analysis['error']}") + return + + print(f"\n📁 File: {analysis['file']}") + print(f"📊 Groups: {analysis['groups']}") + print(f"🔔 Alerts Checked: {analysis['alerts_checked']}") + + # Count issues by severity + critical_count = 0 + warning_count = 0 + + for result in analysis['results']: + if result['issues']: + critical_count += 1 + + print(f"\n{'='*60}") + print(f"📈 Summary:") + print(f" ❌ Alerts with Issues: {critical_count}") + print(f" ⚠️ Warnings: {len(checker.warnings)}") + print(f" 💡 Recommendations: {len(checker.recommendations)}") + + # Print detailed results + if critical_count > 0: + print(f"\n{'='*60}") + print("❌ ALERTS WITH ISSUES:") + print(f"{'='*60}") + + for result in analysis['results']: + if result['issues']: + print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})") + print(f" Severity: {result['severity']}") + print(" Issues:") + for issue in result['issues']: + print(f" • {issue}") + + # Print warnings + if checker.warnings: + print(f"\n{'='*60}") + print("⚠️ WARNINGS:") + print(f"{'='*60}") + for warning in set(checker.warnings): # Remove duplicates + print(f"• {warning}") + + # Print recommendations + if checker.recommendations: + print(f"\n{'='*60}") + print("💡 RECOMMENDATIONS:") + print(f"{'='*60}") + for rec 
in list(set(checker.recommendations))[:10]: # Top 10 unique recommendations + print(f"• {rec}") + + # Overall score + total_alerts = analysis['alerts_checked'] + if total_alerts > 0: + quality_score = ((total_alerts - critical_count) / total_alerts) * 100 + print(f"\n{'='*60}") + print(f"📊 Quality Score: {quality_score:.1f}% ({total_alerts - critical_count}/{total_alerts} alerts passing)") + print(f"{'='*60}\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Audit Prometheus alert rules for quality and best practices", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check a single file + python3 alert_quality_checker.py alerts.yml + + # Check all YAML files in a directory + python3 alert_quality_checker.py /path/to/prometheus/rules/ + +Best Practices Checked: + ✓ Alert naming conventions (PascalCase, descriptive) + ✓ Required labels (severity) + ✓ Required annotations (summary, description) + ✓ Runbook URL presence + ✓ PromQL expression quality + ✓ 'for' clause to prevent flapping + ✓ Template variable usage + """ + ) + + parser.add_argument('path', help='Path to alert rules file or directory') + parser.add_argument('--verbose', action='store_true', help='Show all recommendations') + + args = parser.parse_args() + + checker = AlertQualityChecker() + + # Check if path is file or directory + path = Path(args.path) + + if path.is_file(): + files = [str(path)] + elif path.is_dir(): + files = [str(f) for f in path.rglob('*.yml')] + [str(f) for f in path.rglob('*.yaml')] + else: + print(f"❌ Path not found: {args.path}") + sys.exit(1) + + if not files: + print(f"❌ No YAML files found in: {args.path}") + sys.exit(1) + + print(f"🔍 Checking {len(files)} file(s)...") + + for filepath in files: + analysis = checker.analyze_file(filepath) + print_results(analysis, checker) + + +if __name__ == "__main__": + main() diff --git a/scripts/analyze_metrics.py b/scripts/analyze_metrics.py new file mode 100644 index 0000000..58343bd --- /dev/null +++ b/scripts/analyze_metrics.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 +""" +Analyze metrics from Prometheus or CloudWatch and detect anomalies. +Supports: rate of change analysis, spike detection, trend analysis. +""" + +import argparse +import sys +import json +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +import statistics + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. Install with: pip install requests") + sys.exit(1) + +try: + import boto3 +except ImportError: + boto3 = None + + +class MetricAnalyzer: + def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"): + self.source = source + self.endpoint = endpoint + self.region = region + if source == "cloudwatch" and boto3: + self.cloudwatch = boto3.client('cloudwatch', region_name=region) + elif source == "cloudwatch" and not boto3: + print("⚠️ boto3 not installed. 
Install with: pip install boto3") + sys.exit(1) + + def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]: + """Query Prometheus for metric data.""" + if not self.endpoint: + print("❌ Prometheus endpoint required") + sys.exit(1) + + try: + # Query range for last N hours + end_time = datetime.now() + start_time = end_time - timedelta(hours=hours) + + params = { + 'query': query, + 'start': start_time.timestamp(), + 'end': end_time.timestamp(), + 'step': '5m' # 5-minute resolution + } + + response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30) + response.raise_for_status() + + data = response.json() + if data['status'] != 'success': + print(f"❌ Prometheus query failed: {data}") + return [] + + return data['data']['result'] + + except Exception as e: + print(f"❌ Error querying Prometheus: {e}") + return [] + + def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str], + hours: int = 24, stat: str = "Average") -> List[Dict]: + """Query CloudWatch for metric data.""" + try: + end_time = datetime.now() + start_time = end_time - timedelta(hours=hours) + + dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()] + + response = self.cloudwatch.get_metric_statistics( + Namespace=namespace, + MetricName=metric_name, + Dimensions=dimensions_list, + StartTime=start_time, + EndTime=end_time, + Period=300, # 5-minute intervals + Statistics=[stat] + ) + + return sorted(response['Datapoints'], key=lambda x: x['Timestamp']) + + except Exception as e: + print(f"❌ Error querying CloudWatch: {e}") + return [] + + def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]: + """Detect anomalies using standard deviation method.""" + if len(values) < 10: + return { + "anomalies_detected": False, + "message": "Insufficient data points for anomaly detection" + } + + mean = statistics.mean(values) + stdev = statistics.stdev(values) + threshold_upper = mean + (sensitivity * stdev) + threshold_lower = mean - (sensitivity * stdev) + + anomalies = [] + for i, value in enumerate(values): + if value > threshold_upper or value < threshold_lower: + anomalies.append({ + "index": i, + "value": value, + "deviation": abs(value - mean) / stdev if stdev > 0 else 0 + }) + + return { + "anomalies_detected": len(anomalies) > 0, + "count": len(anomalies), + "anomalies": anomalies, + "stats": { + "mean": mean, + "stdev": stdev, + "threshold_upper": threshold_upper, + "threshold_lower": threshold_lower, + "total_points": len(values) + } + } + + def analyze_trend(self, values: List[float]) -> Dict[str, Any]: + """Analyze trend using simple linear regression.""" + if len(values) < 2: + return {"trend": "unknown", "message": "Insufficient data"} + + n = len(values) + x = list(range(n)) + x_mean = sum(x) / n + y_mean = sum(values) / n + + numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n)) + denominator = sum((x[i] - x_mean) ** 2 for i in range(n)) + + if denominator == 0: + return {"trend": "flat", "slope": 0} + + slope = numerator / denominator + + # Determine trend direction + if abs(slope) < 0.01 * y_mean: # Less than 1% change per interval + trend = "stable" + elif slope > 0: + trend = "increasing" + else: + trend = "decreasing" + + return { + "trend": trend, + "slope": slope, + "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0 + } + + +def print_results(results: Dict[str, Any]): + """Pretty print analysis results.""" + print("\n" + "="*60) + print("📊 METRIC ANALYSIS 
RESULTS") + print("="*60) + + if "error" in results: + print(f"\n❌ Error: {results['error']}") + return + + print(f"\n📈 Data Points: {results.get('data_points', 0)}") + + # Trend analysis + if "trend" in results: + trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "❓") + print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}") + if "rate_of_change" in results["trend"]: + print(f" Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval") + + # Anomaly detection + if "anomalies" in results: + anomaly_data = results["anomalies"] + if anomaly_data["anomalies_detected"]: + print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}") + print(f" Mean: {anomaly_data['stats']['mean']:.2f}") + print(f" Std Dev: {anomaly_data['stats']['stdev']:.2f}") + print(f" Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]") + + print("\n Top Anomalies:") + for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]: + print(f" • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)") + else: + print("\n✅ No anomalies detected") + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze metrics from Prometheus or CloudWatch", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Prometheus: Analyze request rate + python3 analyze_metrics.py prometheus \\ + --endpoint http://localhost:9090 \\ + --query 'rate(http_requests_total[5m])' \\ + --hours 24 + + # CloudWatch: Analyze CPU utilization + python3 analyze_metrics.py cloudwatch \\ + --namespace AWS/EC2 \\ + --metric CPUUtilization \\ + --dimensions InstanceId=i-1234567890abcdef0 \\ + --hours 48 + """ + ) + + parser.add_argument('source', choices=['prometheus', 'cloudwatch'], + help='Metric source') + parser.add_argument('--endpoint', help='Prometheus endpoint URL') + parser.add_argument('--query', help='PromQL query') + parser.add_argument('--namespace', help='CloudWatch namespace') + parser.add_argument('--metric', help='CloudWatch metric name') + parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)') + parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)') + parser.add_argument('--sensitivity', type=float, default=2.0, + help='Anomaly detection sensitivity (std deviations, default: 2.0)') + parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)') + + args = parser.parse_args() + + analyzer = MetricAnalyzer(args.source, args.endpoint, args.region) + + # Query metrics + if args.source == 'prometheus': + if not args.query: + print("❌ --query required for Prometheus") + sys.exit(1) + + print(f"🔍 Querying Prometheus: {args.query}") + results = analyzer.query_prometheus(args.query, args.hours) + + if not results: + print("❌ No data returned") + sys.exit(1) + + # Extract values from first result series + values = [float(v[1]) for v in results[0].get('values', [])] + + elif args.source == 'cloudwatch': + if not all([args.namespace, args.metric, args.dimensions]): + print("❌ --namespace, --metric, and --dimensions required for CloudWatch") + sys.exit(1) + + dims = dict(item.split('=') for item in args.dimensions.split(',')) + + print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}") + results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours) + + if not results: 
+ print("❌ No data returned") + sys.exit(1) + + values = [point['Average'] for point in results] + + # Analyze metrics + analysis_results = { + "data_points": len(values), + "trend": analyzer.analyze_trend(values), + "anomalies": analyzer.detect_anomalies(values, args.sensitivity) + } + + print_results(analysis_results) + + +if __name__ == "__main__": + main() diff --git a/scripts/dashboard_generator.py b/scripts/dashboard_generator.py new file mode 100644 index 0000000..a90d27e --- /dev/null +++ b/scripts/dashboard_generator.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +""" +Generate Grafana dashboards from templates. +Supports: web applications, Kubernetes, databases, Redis, and custom metrics. +""" + +import argparse +import sys +import json +from typing import Dict, List, Any, Optional +from pathlib import Path + + +class DashboardGenerator: + def __init__(self, title: str, datasource: str = "Prometheus"): + self.title = title + self.datasource = datasource + self.dashboard = self._create_base_dashboard() + self.panel_id = 1 + self.row_y = 0 + + def _create_base_dashboard(self) -> Dict[str, Any]: + """Create base dashboard structure.""" + return { + "dashboard": { + "title": self.title, + "tags": [], + "timezone": "browser", + "schemaVersion": 16, + "version": 0, + "refresh": "30s", + "panels": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + } + }, + "overwrite": True + } + + def add_variable(self, name: str, label: str, query: str): + """Add a template variable.""" + variable = { + "name": name, + "label": label, + "type": "query", + "datasource": self.datasource, + "query": query, + "refresh": 1, + "regex": "", + "multi": False, + "includeAll": False + } + self.dashboard["dashboard"]["templating"]["list"].append(variable) + + def add_row(self, title: str): + """Add a row panel.""" + panel = { + "id": self.panel_id, + "type": "row", + "title": title, + "collapsed": False, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": self.row_y} + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + self.row_y += 1 + + def add_graph(self, title: str, targets: List[Dict[str, str]], unit: str = "short", + width: int = 12, height: int = 8): + """Add a graph panel.""" + panel = { + "id": self.panel_id, + "type": "graph", + "title": title, + "datasource": self.datasource, + "targets": [ + { + "expr": target["query"], + "legendFormat": target.get("legend", ""), + "refId": chr(65 + i) # A, B, C, etc. 
+ } + for i, target in enumerate(targets) + ], + "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y}, + "yaxes": [ + {"format": unit, "label": None, "show": True}, + {"format": "short", "label": None, "show": True} + ], + "lines": True, + "fill": 1, + "linewidth": 2, + "legend": { + "show": True, + "alignAsTable": True, + "avg": True, + "current": True, + "max": True, + "min": False, + "total": False, + "values": True + } + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + self.row_y += height + + def add_stat(self, title: str, query: str, unit: str = "short", + width: int = 6, height: int = 4): + """Add a stat panel (single value).""" + panel = { + "id": self.panel_id, + "type": "stat", + "title": title, + "datasource": self.datasource, + "targets": [ + { + "expr": query, + "refId": "A" + } + ], + "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y}, + "options": { + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "values": False, + "calcs": ["lastNotNull"] + } + }, + "fieldConfig": { + "defaults": { + "unit": unit, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": None, "color": "green"}, + {"value": 80, "color": "red"} + ] + } + } + } + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + + def generate_webapp_dashboard(self, service: str): + """Generate dashboard for web application.""" + self.add_variable("service", "Service", f"label_values({service}_http_requests_total, service)") + + # Request metrics + self.add_row("Request Metrics") + + self.add_graph( + "Request Rate", + [{"query": f'sum(rate({service}_http_requests_total[5m])) by (status)', "legend": "{{status}}"}], + unit="reqps", + width=12 + ) + + self.add_graph( + "Request Latency (p50, p95, p99)", + [ + {"query": f'histogram_quantile(0.50, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p50"}, + {"query": f'histogram_quantile(0.95, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p95"}, + {"query": f'histogram_quantile(0.99, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p99"} + ], + unit="s", + width=12 + ) + + # Error rate + self.add_row("Errors") + + self.add_graph( + "Error Rate (%)", + [{"query": f'sum(rate({service}_http_requests_total{{status=~"5.."}}[5m])) / sum(rate({service}_http_requests_total[5m])) * 100', "legend": "Error Rate"}], + unit="percent", + width=12 + ) + + # Resource usage + self.add_row("Resource Usage") + + self.add_graph( + "CPU Usage", + [{"query": f'sum(rate(process_cpu_seconds_total{{job="{service}"}}[5m])) * 100', "legend": "CPU %"}], + unit="percent", + width=12 + ) + + self.add_graph( + "Memory Usage", + [{"query": f'process_resident_memory_bytes{{job="{service}"}}', "legend": "Memory"}], + unit="bytes", + width=12 + ) + + def generate_kubernetes_dashboard(self, namespace: str): + """Generate dashboard for Kubernetes cluster.""" + self.add_variable("namespace", "Namespace", f"label_values(kube_pod_info, namespace)") + + # Cluster overview + self.add_row("Cluster Overview") + + self.add_stat("Total Pods", f'count(kube_pod_info{{namespace="{namespace}"}})', width=6) + self.add_stat("Running Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Running"}})', width=6) + self.add_stat("Pending Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Pending"}})', width=6) + self.add_stat("Failed Pods", 
f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Failed"}})', width=6) + + # Resource usage + self.add_row("Resource Usage") + + self.add_graph( + "CPU Usage by Pod", + [{"query": f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "{{pod}}"}], + unit="percent", + width=12 + ) + + self.add_graph( + "Memory Usage by Pod", + [{"query": f'sum(container_memory_usage_bytes{{namespace="{namespace}"}}) by (pod)', "legend": "{{pod}}"}], + unit="bytes", + width=12 + ) + + # Network + self.add_row("Network") + + self.add_graph( + "Network I/O", + [ + {"query": f'sum(rate(container_network_receive_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Receive - {{pod}}"}, + {"query": f'sum(rate(container_network_transmit_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Transmit - {{pod}}"} + ], + unit="Bps", + width=12 + ) + + def generate_database_dashboard(self, db_type: str, instance: str): + """Generate dashboard for database (postgres/mysql).""" + if db_type == "postgres": + self._generate_postgres_dashboard(instance) + elif db_type == "mysql": + self._generate_mysql_dashboard(instance) + + def _generate_postgres_dashboard(self, instance: str): + """Generate PostgreSQL dashboard.""" + self.add_row("PostgreSQL Metrics") + + self.add_graph( + "Connections", + [ + {"query": f'pg_stat_database_numbackends{{instance="{instance}"}}', "legend": "{{datname}}"} + ], + unit="short", + width=12 + ) + + self.add_graph( + "Transactions per Second", + [ + {"query": f'rate(pg_stat_database_xact_commit{{instance="{instance}"}}[5m])', "legend": "Commits"}, + {"query": f'rate(pg_stat_database_xact_rollback{{instance="{instance}"}}[5m])', "legend": "Rollbacks"} + ], + unit="tps", + width=12 + ) + + self.add_graph( + "Query Duration (p95)", + [ + {"query": f'histogram_quantile(0.95, rate(pg_stat_statements_total_time_bucket{{instance="{instance}"}}[5m]))', "legend": "p95"} + ], + unit="ms", + width=12 + ) + + def _generate_mysql_dashboard(self, instance: str): + """Generate MySQL dashboard.""" + self.add_row("MySQL Metrics") + + self.add_graph( + "Connections", + [ + {"query": f'mysql_global_status_threads_connected{{instance="{instance}"}}', "legend": "Connected"}, + {"query": f'mysql_global_status_threads_running{{instance="{instance}"}}', "legend": "Running"} + ], + unit="short", + width=12 + ) + + self.add_graph( + "Queries per Second", + [ + {"query": f'rate(mysql_global_status_queries{{instance="{instance}"}}[5m])', "legend": "Queries"} + ], + unit="qps", + width=12 + ) + + def save(self, output_file: str): + """Save dashboard to file.""" + try: + with open(output_file, 'w') as f: + json.dump(self.dashboard, f, indent=2) + return True + except Exception as e: + print(f"❌ Error saving dashboard: {e}") + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Generate Grafana dashboards from templates", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Web application dashboard + python3 dashboard_generator.py webapp \\ + --title "My API Dashboard" \\ + --service my_api \\ + --output dashboard.json + + # Kubernetes dashboard + python3 dashboard_generator.py kubernetes \\ + --title "K8s Namespace" \\ + --namespace production \\ + --output k8s-dashboard.json + + # Database dashboard + python3 dashboard_generator.py database \\ + --title "PostgreSQL" \\ + --db-type postgres \\ + --instance db.example.com:5432 \\ + --output db-dashboard.json + """ + ) + + 
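+    # The positional dashboard type selects a generator; the type-specific flags
+    # (--service, --namespace, --db-type/--instance) are validated after parsing.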
parser.add_argument('type', choices=['webapp', 'kubernetes', 'database'], + help='Dashboard type') + parser.add_argument('--title', required=True, help='Dashboard title') + parser.add_argument('--output', required=True, help='Output file path') + parser.add_argument('--datasource', default='Prometheus', help='Data source name') + + # Web app specific + parser.add_argument('--service', help='Service name (for webapp)') + + # Kubernetes specific + parser.add_argument('--namespace', help='Kubernetes namespace') + + # Database specific + parser.add_argument('--db-type', choices=['postgres', 'mysql'], help='Database type') + parser.add_argument('--instance', help='Database instance') + + args = parser.parse_args() + + print(f"🎨 Generating {args.type} dashboard: {args.title}") + + generator = DashboardGenerator(args.title, args.datasource) + + if args.type == 'webapp': + if not args.service: + print("❌ --service required for webapp dashboard") + sys.exit(1) + generator.generate_webapp_dashboard(args.service) + + elif args.type == 'kubernetes': + if not args.namespace: + print("❌ --namespace required for kubernetes dashboard") + sys.exit(1) + generator.generate_kubernetes_dashboard(args.namespace) + + elif args.type == 'database': + if not args.db_type or not args.instance: + print("❌ --db-type and --instance required for database dashboard") + sys.exit(1) + generator.generate_database_dashboard(args.db_type, args.instance) + + if generator.save(args.output): + print(f"✅ Dashboard saved to: {args.output}") + print(f"\n📝 Import to Grafana:") + print(f" 1. Go to Grafana → Dashboards → Import") + print(f" 2. Upload {args.output}") + print(f" 3. Select datasource and save") + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/datadog_cost_analyzer.py b/scripts/datadog_cost_analyzer.py new file mode 100644 index 0000000..a748266 --- /dev/null +++ b/scripts/datadog_cost_analyzer.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python3 +""" +Analyze Datadog usage and identify cost optimization opportunities. +Helps find waste in custom metrics, logs, APM, and infrastructure monitoring. +""" + +import argparse +import sys +import os +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from collections import defaultdict + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. 
Install with: pip install requests") + sys.exit(1) + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class DatadogCostAnalyzer: + # Pricing (as of 2024-2025) + PRICING = { + 'infrastructure_pro': 15, # per host per month + 'infrastructure_enterprise': 23, + 'custom_metric': 0.01, # per metric per month (first 100 free per host) + 'log_ingestion': 0.10, # per GB ingested per month + 'apm_host': 31, # APM Pro per host per month + 'apm_span': 1.70, # per million indexed spans + } + + def __init__(self, api_key: str, app_key: str, site: str = "datadoghq.com"): + self.api_key = api_key + self.app_key = app_key + self.site = site + self.base_url = f"https://api.{site}" + self.headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key, + 'Content-Type': 'application/json' + } + + def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict: + """Make API request to Datadog.""" + try: + url = f"{self.base_url}{endpoint}" + response = requests.get(url, headers=self.headers, params=params, timeout=30) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"❌ API Error: {e}") + return {} + + def get_usage_metrics(self, start_date: str, end_date: str) -> Dict[str, Any]: + """Get usage metrics for specified date range.""" + endpoint = "/api/v1/usage/summary" + params = { + 'start_month': start_date, + 'end_month': end_date, + 'include_org_details': 'true' + } + + data = self._make_request(endpoint, params) + return data.get('usage', []) + + def get_custom_metrics(self) -> Dict[str, Any]: + """Get custom metrics usage and identify high-cardinality metrics.""" + endpoint = "/api/v1/usage/timeseries" + + # Get last 30 days + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'metrics': [], 'total_count': 0} + + # Extract custom metrics info + usage_data = data.get('usage', []) + + metrics_summary = { + 'total_custom_metrics': 0, + 'avg_custom_metrics': 0, + 'billable_metrics': 0 + } + + for day in usage_data: + if 'timeseries' in day: + for ts in day['timeseries']: + if ts.get('metric_category') == 'custom': + metrics_summary['total_custom_metrics'] = max( + metrics_summary['total_custom_metrics'], + ts.get('num_custom_timeseries', 0) + ) + + # Calculate billable (first 100 free) + metrics_summary['billable_metrics'] = max(0, metrics_summary['total_custom_metrics'] - 100) + + return metrics_summary + + def get_infrastructure_hosts(self) -> Dict[str, Any]: + """Get infrastructure host count and breakdown.""" + endpoint = "/api/v1/usage/hosts" + + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'total_hosts': 0} + + usage = data.get('usage', []) + + host_summary = { + 'total_hosts': 0, + 'agent_hosts': 0, + 'aws_hosts': 0, + 'azure_hosts': 0, + 'gcp_hosts': 0, + 'container_count': 0 + } + + for day in usage: + host_summary['total_hosts'] = max(host_summary['total_hosts'], day.get('host_count', 0)) + host_summary['agent_hosts'] = max(host_summary['agent_hosts'], day.get('agent_host_count', 0)) + host_summary['aws_hosts'] = max(host_summary['aws_hosts'], day.get('aws_host_count', 0)) + 
host_summary['azure_hosts'] = max(host_summary['azure_hosts'], day.get('azure_host_count', 0)) + host_summary['gcp_hosts'] = max(host_summary['gcp_hosts'], day.get('gcp_host_count', 0)) + host_summary['container_count'] = max(host_summary['container_count'], day.get('container_count', 0)) + + return host_summary + + def get_log_usage(self) -> Dict[str, Any]: + """Get log ingestion and retention usage.""" + endpoint = "/api/v1/usage/logs" + + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'total_gb': 0, 'daily_avg_gb': 0} + + usage = data.get('usage', []) + + total_ingested = 0 + days_count = len(usage) + + for day in usage: + total_ingested += day.get('ingested_events_bytes', 0) + + total_gb = total_ingested / (1024**3) # Convert to GB + daily_avg_gb = total_gb / max(days_count, 1) + + return { + 'total_gb': total_gb, + 'daily_avg_gb': daily_avg_gb, + 'monthly_projected_gb': daily_avg_gb * 30 + } + + def get_unused_monitors(self) -> List[Dict[str, Any]]: + """Find monitors that haven't alerted in 30+ days.""" + endpoint = "/api/v1/monitor" + + data = self._make_request(endpoint) + + if not data: + return [] + + monitors = data if isinstance(data, list) else [] + + unused = [] + now = datetime.now() + + for monitor in monitors: + # Check if monitor has triggered recently + overall_state = monitor.get('overall_state') + modified = monitor.get('modified', '') + + # If monitor has been in OK state and not modified in 30+ days + try: + if modified: + mod_date = datetime.fromisoformat(modified.replace('Z', '+00:00')) + days_since_modified = (now - mod_date.replace(tzinfo=None)).days + + if days_since_modified > 30 and overall_state in ['OK', 'No Data']: + unused.append({ + 'name': monitor.get('name', 'Unknown'), + 'id': monitor.get('id'), + 'days_since_modified': days_since_modified, + 'state': overall_state + }) + except: + pass + + return unused + + def calculate_costs(self, usage_data: Dict[str, Any]) -> Dict[str, float]: + """Calculate estimated monthly costs.""" + costs = { + 'infrastructure': 0, + 'custom_metrics': 0, + 'logs': 0, + 'apm': 0, + 'total': 0 + } + + # Infrastructure (assuming Pro tier) + if 'hosts' in usage_data: + costs['infrastructure'] = usage_data['hosts'].get('total_hosts', 0) * self.PRICING['infrastructure_pro'] + + # Custom metrics + if 'custom_metrics' in usage_data: + billable = usage_data['custom_metrics'].get('billable_metrics', 0) + costs['custom_metrics'] = billable * self.PRICING['custom_metric'] + + # Logs + if 'logs' in usage_data: + monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0) + costs['logs'] = monthly_gb * self.PRICING['log_ingestion'] + + costs['total'] = sum(costs.values()) + + return costs + + def get_recommendations(self, usage_data: Dict[str, Any]) -> List[str]: + """Generate cost optimization recommendations.""" + recommendations = [] + + # Custom metrics recommendations + if 'custom_metrics' in usage_data: + billable = usage_data['custom_metrics'].get('billable_metrics', 0) + if billable > 500: + savings = (billable * 0.3) * self.PRICING['custom_metric'] # Assume 30% reduction possible + recommendations.append({ + 'category': 'Custom Metrics', + 'issue': f'High custom metric count: {billable:,} billable metrics', + 'action': 'Review metric tags for high cardinality, consider aggregating or dropping unused metrics', + 'potential_savings': 
f'${savings:.2f}/month' + }) + + # Container vs VM recommendations + if 'hosts' in usage_data: + hosts = usage_data['hosts'].get('total_hosts', 0) + containers = usage_data['hosts'].get('container_count', 0) + + if containers > hosts * 10: # Many containers per host + savings = hosts * 0.2 * self.PRICING['infrastructure_pro'] + recommendations.append({ + 'category': 'Infrastructure', + 'issue': f'{containers:,} containers running on {hosts} hosts', + 'action': 'Consider using container monitoring instead of host-based (can be 50-70% cheaper)', + 'potential_savings': f'${savings:.2f}/month' + }) + + # Unused monitors + if 'unused_monitors' in usage_data: + count = len(usage_data['unused_monitors']) + if count > 10: + recommendations.append({ + 'category': 'Monitors', + 'issue': f'{count} monitors unused for 30+ days', + 'action': 'Delete or disable unused monitors to reduce noise and improve performance', + 'potential_savings': 'Operational efficiency' + }) + + # Log volume recommendations + if 'logs' in usage_data: + monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0) + if monthly_gb > 100: + savings = (monthly_gb * 0.4) * self.PRICING['log_ingestion'] # 40% reduction + recommendations.append({ + 'category': 'Logs', + 'issue': f'High log volume: {monthly_gb:.1f} GB/month projected', + 'action': 'Review log sources, implement sampling for debug logs, exclude health checks', + 'potential_savings': f'${savings:.2f}/month' + }) + + # Migration recommendation if costs are high + costs = self.calculate_costs(usage_data) + if costs['total'] > 5000: + oss_cost = usage_data['hosts'].get('total_hosts', 0) * 15 # Rough estimate for self-hosted + savings = costs['total'] - oss_cost + recommendations.append({ + 'category': 'Strategic', + 'issue': f'Total monthly cost: ${costs["total"]:.2f}', + 'action': 'Consider migrating to open-source stack (Prometheus + Grafana + Loki)', + 'potential_savings': f'${savings:.2f}/month (~{(savings/costs["total"]*100):.0f}% reduction)' + }) + + return recommendations + + +def print_usage_summary(usage_data: Dict[str, Any]): + """Print usage summary.""" + print("\n" + "="*70) + print("📊 DATADOG USAGE SUMMARY") + print("="*70) + + # Infrastructure + if 'hosts' in usage_data: + hosts = usage_data['hosts'] + print(f"\n🖥️ Infrastructure:") + print(f" Total Hosts: {hosts.get('total_hosts', 0):,}") + print(f" Agent Hosts: {hosts.get('agent_hosts', 0):,}") + print(f" AWS Hosts: {hosts.get('aws_hosts', 0):,}") + print(f" Azure Hosts: {hosts.get('azure_hosts', 0):,}") + print(f" GCP Hosts: {hosts.get('gcp_hosts', 0):,}") + print(f" Containers: {hosts.get('container_count', 0):,}") + + # Custom Metrics + if 'custom_metrics' in usage_data: + metrics = usage_data['custom_metrics'] + print(f"\n📈 Custom Metrics:") + print(f" Total: {metrics.get('total_custom_metrics', 0):,}") + print(f" Billable: {metrics.get('billable_metrics', 0):,} (first 100 free)") + + # Logs + if 'logs' in usage_data: + logs = usage_data['logs'] + print(f"\n📝 Logs:") + print(f" Daily Average: {logs.get('daily_avg_gb', 0):.2f} GB") + print(f" Monthly Projected: {logs.get('monthly_projected_gb', 0):.2f} GB") + + # Unused Monitors + if 'unused_monitors' in usage_data: + print(f"\n🔔 Unused Monitors:") + print(f" Count: {len(usage_data['unused_monitors'])}") + + +def print_cost_breakdown(costs: Dict[str, float]): + """Print cost breakdown.""" + print("\n" + "="*70) + print("💰 ESTIMATED MONTHLY COSTS") + print("="*70) + + print(f"\n Infrastructure Monitoring: ${costs['infrastructure']:,.2f}") + print(f" 
Custom Metrics: ${costs['custom_metrics']:,.2f}") + print(f" Log Management: ${costs['logs']:,.2f}") + print(f" APM: ${costs['apm']:,.2f}") + print(f" " + "-"*40) + print(f" TOTAL: ${costs['total']:,.2f}/month") + print(f" ${costs['total']*12:,.2f}/year") + + +def print_recommendations(recommendations: List[Dict]): + """Print recommendations.""" + print("\n" + "="*70) + print("💡 COST OPTIMIZATION RECOMMENDATIONS") + print("="*70) + + total_savings = 0 + + for i, rec in enumerate(recommendations, 1): + print(f"\n{i}. {rec['category']}") + print(f" Issue: {rec['issue']}") + print(f" Action: {rec['action']}") + print(f" Potential Savings: {rec['potential_savings']}") + + # Extract savings amount if it's a dollar value + if '$' in rec['potential_savings']: + try: + amount = float(rec['potential_savings'].replace('$', '').replace('/month', '').replace(',', '')) + total_savings += amount + except: + pass + + if total_savings > 0: + print(f"\n{'='*70}") + print(f"💵 Total Potential Monthly Savings: ${total_savings:,.2f}") + print(f"💵 Total Potential Annual Savings: ${total_savings*12:,.2f}") + print(f"{'='*70}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze Datadog usage and identify cost optimization opportunities", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Analyze current usage + python3 datadog_cost_analyzer.py \\ + --api-key DD_API_KEY \\ + --app-key DD_APP_KEY + + # Use environment variables + export DD_API_KEY=your_api_key + export DD_APP_KEY=your_app_key + python3 datadog_cost_analyzer.py + + # Specify site (for EU) + python3 datadog_cost_analyzer.py --site datadoghq.eu + +Required Datadog Permissions: + - usage_read + - monitors_read + """ + ) + + parser.add_argument('--api-key', + default=os.environ.get('DD_API_KEY'), + help='Datadog API key (or set DD_API_KEY env var)') + parser.add_argument('--app-key', + default=os.environ.get('DD_APP_KEY'), + help='Datadog Application key (or set DD_APP_KEY env var)') + parser.add_argument('--site', + default='datadoghq.com', + help='Datadog site (default: datadoghq.com, EU: datadoghq.eu)') + + args = parser.parse_args() + + if not args.api_key or not args.app_key: + print("❌ Error: API key and Application key required") + print(" Set via --api-key and --app-key flags or DD_API_KEY and DD_APP_KEY env vars") + sys.exit(1) + + print("🔍 Analyzing Datadog usage...") + print(" This may take 30-60 seconds...\n") + + analyzer = DatadogCostAnalyzer(args.api_key, args.app_key, args.site) + + # Gather usage data + usage_data = {} + + print(" ⏳ Fetching infrastructure usage...") + usage_data['hosts'] = analyzer.get_infrastructure_hosts() + + print(" ⏳ Fetching custom metrics...") + usage_data['custom_metrics'] = analyzer.get_custom_metrics() + + print(" ⏳ Fetching log usage...") + usage_data['logs'] = analyzer.get_log_usage() + + print(" ⏳ Finding unused monitors...") + usage_data['unused_monitors'] = analyzer.get_unused_monitors() + + # Calculate costs + costs = analyzer.calculate_costs(usage_data) + + # Generate recommendations + recommendations = analyzer.get_recommendations(usage_data) + + # Print results + print_usage_summary(usage_data) + print_cost_breakdown(costs) + print_recommendations(recommendations) + + print("\n" + "="*70) + print("✅ Analysis complete!") + print("="*70) + + +if __name__ == "__main__": + main() diff --git a/scripts/health_check_validator.py b/scripts/health_check_validator.py new file mode 100644 index 0000000..1be4fc9 --- /dev/null +++ 
b/scripts/health_check_validator.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +""" +Validate health check endpoints and analyze response quality. +Checks: response time, status code, response format, dependencies. +""" + +import argparse +import sys +import time +import json +from typing import Dict, List, Any, Optional +from urllib.parse import urlparse + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. Install with: pip install requests") + sys.exit(1) + + +class HealthCheckValidator: + def __init__(self, timeout: int = 5): + self.timeout = timeout + self.results = [] + + def validate_endpoint(self, url: str) -> Dict[str, Any]: + """Validate a health check endpoint.""" + result = { + "url": url, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + "checks": [], + "warnings": [], + "errors": [] + } + + try: + # Make request + start_time = time.time() + response = requests.get(url, timeout=self.timeout, verify=True) + response_time = time.time() - start_time + + result["status_code"] = response.status_code + result["response_time"] = response_time + + # Check 1: Status code + if response.status_code == 200: + result["checks"].append("✅ Status code is 200") + else: + result["errors"].append(f"❌ Unexpected status code: {response.status_code} (expected 200)") + + # Check 2: Response time + if response_time < 1.0: + result["checks"].append(f"✅ Response time: {response_time:.3f}s (< 1s)") + elif response_time < 3.0: + result["warnings"].append(f"⚠️ Slow response time: {response_time:.3f}s (should be < 1s)") + else: + result["errors"].append(f"❌ Very slow response time: {response_time:.3f}s (should be < 1s)") + + # Check 3: Content type + content_type = response.headers.get('Content-Type', '') + if 'application/json' in content_type: + result["checks"].append("✅ Content-Type is application/json") + + # Try to parse JSON + try: + data = response.json() + result["response_data"] = data + + # Check for common health check fields + self._validate_json_structure(data, result) + + except json.JSONDecodeError: + result["errors"].append("❌ Invalid JSON response") + elif 'text/plain' in content_type: + result["warnings"].append("⚠️ Content-Type is text/plain (JSON recommended)") + result["response_data"] = response.text + else: + result["warnings"].append(f"⚠️ Unexpected Content-Type: {content_type}") + + # Check 4: Response headers + self._validate_headers(response.headers, result) + + except requests.exceptions.Timeout: + result["errors"].append(f"❌ Request timeout (> {self.timeout}s)") + result["status_code"] = None + result["response_time"] = None + + except requests.exceptions.ConnectionError: + result["errors"].append("❌ Connection error (endpoint unreachable)") + result["status_code"] = None + result["response_time"] = None + + except requests.exceptions.SSLError: + result["errors"].append("❌ SSL certificate validation failed") + result["status_code"] = None + result["response_time"] = None + + except Exception as e: + result["errors"].append(f"❌ Unexpected error: {str(e)}") + result["status_code"] = None + result["response_time"] = None + + # Overall status + if result["errors"]: + result["overall_status"] = "UNHEALTHY" + elif result["warnings"]: + result["overall_status"] = "DEGRADED" + else: + result["overall_status"] = "HEALTHY" + + return result + + def _validate_json_structure(self, data: Dict[str, Any], result: Dict[str, Any]): + """Validate JSON health check structure.""" + # Check for status field + if "status" in data: + status = 
data["status"] + if status in ["ok", "healthy", "up", "pass"]: + result["checks"].append(f"✅ Status field present: '{status}'") + else: + result["warnings"].append(f"⚠️ Status field has unexpected value: '{status}'") + else: + result["warnings"].append("⚠️ Missing 'status' field (recommended)") + + # Check for version/build info + if any(key in data for key in ["version", "build", "commit", "timestamp"]): + result["checks"].append("✅ Version/build information present") + else: + result["warnings"].append("⚠️ No version/build information (recommended)") + + # Check for dependencies + if "dependencies" in data or "checks" in data or "components" in data: + result["checks"].append("✅ Dependency checks present") + + # Validate dependency structure + deps = data.get("dependencies") or data.get("checks") or data.get("components") + if isinstance(deps, dict): + unhealthy_deps = [] + for name, info in deps.items(): + if isinstance(info, dict): + dep_status = info.get("status", "unknown") + if dep_status not in ["ok", "healthy", "up", "pass"]: + unhealthy_deps.append(name) + elif isinstance(info, str): + if info not in ["ok", "healthy", "up", "pass"]: + unhealthy_deps.append(name) + + if unhealthy_deps: + result["warnings"].append(f"⚠️ Unhealthy dependencies: {', '.join(unhealthy_deps)}") + else: + result["checks"].append(f"✅ All dependencies healthy ({len(deps)} checked)") + else: + result["warnings"].append("⚠️ No dependency checks (recommended for production services)") + + # Check for uptime/metrics + if any(key in data for key in ["uptime", "metrics", "stats"]): + result["checks"].append("✅ Metrics/stats present") + + def _validate_headers(self, headers: Dict[str, str], result: Dict[str, Any]): + """Validate response headers.""" + # Check for caching headers + cache_control = headers.get('Cache-Control', '') + if 'no-cache' in cache_control or 'no-store' in cache_control: + result["checks"].append("✅ Caching disabled (Cache-Control: no-cache)") + else: + result["warnings"].append("⚠️ Caching not explicitly disabled (add Cache-Control: no-cache)") + + def validate_multiple(self, urls: List[str]) -> List[Dict[str, Any]]: + """Validate multiple health check endpoints.""" + results = [] + for url in urls: + print(f"🔍 Checking: {url}") + result = self.validate_endpoint(url) + results.append(result) + return results + + +def print_result(result: Dict[str, Any], verbose: bool = False): + """Print validation result.""" + status_emoji = { + "HEALTHY": "✅", + "DEGRADED": "⚠️", + "UNHEALTHY": "❌" + } + + print("\n" + "="*60) + emoji = status_emoji.get(result["overall_status"], "❓") + print(f"{emoji} {result['overall_status']}: {result['url']}") + print("="*60) + + if result.get("status_code"): + print(f"\n📊 Status Code: {result['status_code']}") + print(f"⏱️ Response Time: {result['response_time']:.3f}s") + + # Print checks + if result["checks"]: + print(f"\n✅ Passed Checks:") + for check in result["checks"]: + print(f" {check}") + + # Print warnings + if result["warnings"]: + print(f"\n⚠️ Warnings:") + for warning in result["warnings"]: + print(f" {warning}") + + # Print errors + if result["errors"]: + print(f"\n❌ Errors:") + for error in result["errors"]: + print(f" {error}") + + # Print response data if verbose + if verbose and "response_data" in result: + print(f"\n📄 Response Data:") + if isinstance(result["response_data"], dict): + print(json.dumps(result["response_data"], indent=2)) + else: + print(result["response_data"]) + + print("="*60) + + +def print_summary(results: List[Dict[str, Any]]): + 
"""Print summary of multiple validations.""" + print("\n" + "="*60) + print("📊 HEALTH CHECK VALIDATION SUMMARY") + print("="*60) + + healthy = sum(1 for r in results if r["overall_status"] == "HEALTHY") + degraded = sum(1 for r in results if r["overall_status"] == "DEGRADED") + unhealthy = sum(1 for r in results if r["overall_status"] == "UNHEALTHY") + + print(f"\n✅ Healthy: {healthy}/{len(results)}") + print(f"⚠️ Degraded: {degraded}/{len(results)}") + print(f"❌ Unhealthy: {unhealthy}/{len(results)}") + + if results: + avg_response_time = sum(r.get("response_time", 0) for r in results if r.get("response_time")) / len(results) + print(f"\n⏱️ Average Response Time: {avg_response_time:.3f}s") + + print("="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Validate health check endpoints", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check a single endpoint + python3 health_check_validator.py https://api.example.com/health + + # Check multiple endpoints + python3 health_check_validator.py \\ + https://api.example.com/health \\ + https://api.example.com/readiness + + # Verbose output with response data + python3 health_check_validator.py https://api.example.com/health --verbose + + # Custom timeout + python3 health_check_validator.py https://api.example.com/health --timeout 10 + +Best Practices Checked: + ✓ Returns 200 status code + ✓ Response time < 1 second + ✓ Returns JSON format + ✓ Contains 'status' field + ✓ Includes version/build info + ✓ Checks dependencies + ✓ Includes metrics + ✓ Disables caching + """ + ) + + parser.add_argument('urls', nargs='+', help='Health check endpoint URL(s)') + parser.add_argument('--timeout', type=int, default=5, help='Request timeout in seconds (default: 5)') + parser.add_argument('--verbose', action='store_true', help='Show detailed response data') + + args = parser.parse_args() + + validator = HealthCheckValidator(timeout=args.timeout) + + results = validator.validate_multiple(args.urls) + + # Print individual results + for result in results: + print_result(result, args.verbose) + + # Print summary if multiple endpoints + if len(results) > 1: + print_summary(results) + + +if __name__ == "__main__": + main() diff --git a/scripts/log_analyzer.py b/scripts/log_analyzer.py new file mode 100644 index 0000000..a4f7803 --- /dev/null +++ b/scripts/log_analyzer.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +""" +Parse and analyze logs for patterns, errors, and anomalies. +Supports: error detection, frequency analysis, pattern matching. 
+""" + +import argparse +import sys +import re +import json +from collections import Counter, defaultdict +from datetime import datetime +from typing import Dict, List, Any, Optional +from pathlib import Path + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class LogAnalyzer: + # Common log level patterns + LOG_LEVELS = { + 'ERROR': r'\b(ERROR|Error|error)\b', + 'WARN': r'\b(WARN|Warning|warn|warning)\b', + 'INFO': r'\b(INFO|Info|info)\b', + 'DEBUG': r'\b(DEBUG|Debug|debug)\b', + 'FATAL': r'\b(FATAL|Fatal|fatal|CRITICAL|Critical)\b' + } + + # Common error patterns + ERROR_PATTERNS = { + 'exception': r'Exception|exception|EXCEPTION', + 'stack_trace': r'\s+at\s+.*\(.*:\d+\)', + 'http_error': r'\b[45]\d{2}\b', # 4xx and 5xx HTTP codes + 'timeout': r'timeout|timed out|TIMEOUT', + 'connection_refused': r'connection refused|ECONNREFUSED', + 'out_of_memory': r'OutOfMemoryError|OOM|out of memory', + 'null_pointer': r'NullPointerException|null pointer|NPE', + 'database_error': r'SQLException|database error|DB error' + } + + def __init__(self, log_file: str): + self.log_file = log_file + self.lines = [] + self.log_levels = Counter() + self.error_patterns = Counter() + self.timestamps = [] + + def parse_file(self) -> bool: + """Parse log file.""" + try: + with open(self.log_file, 'r', encoding='utf-8', errors='ignore') as f: + self.lines = f.readlines() + return True + except Exception as e: + print(f"❌ Error reading file: {e}") + return False + + def analyze_log_levels(self): + """Count log levels.""" + for line in self.lines: + for level, pattern in self.LOG_LEVELS.items(): + if re.search(pattern, line): + self.log_levels[level] += 1 + break # Count each line only once + + def analyze_error_patterns(self): + """Detect common error patterns.""" + for line in self.lines: + for pattern_name, pattern in self.ERROR_PATTERNS.items(): + if re.search(pattern, line, re.IGNORECASE): + self.error_patterns[pattern_name] += 1 + + def extract_timestamps(self, timestamp_pattern: Optional[str] = None): + """Extract timestamps from logs.""" + if not timestamp_pattern: + # Common timestamp patterns + patterns = [ + r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}', # ISO format + r'\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}', # Apache format + r'\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}', # Syslog format + ] + else: + patterns = [timestamp_pattern] + + for line in self.lines: + for pattern in patterns: + match = re.search(pattern, line) + if match: + self.timestamps.append(match.group()) + break + + def find_error_lines(self, context: int = 2) -> List[Dict[str, Any]]: + """Find error lines with context.""" + errors = [] + + for i, line in enumerate(self.lines): + # Check if line contains error keywords + is_error = any(re.search(pattern, line, re.IGNORECASE) + for pattern in [self.LOG_LEVELS['ERROR'], self.LOG_LEVELS['FATAL']]) + + if is_error: + # Get context lines + start = max(0, i - context) + end = min(len(self.lines), i + context + 1) + context_lines = self.lines[start:end] + + errors.append({ + 'line_number': i + 1, + 'line': line.strip(), + 'context': ''.join(context_lines) + }) + + return errors + + def analyze_frequency(self, time_window_minutes: int = 5) -> Dict[str, Any]: + """Analyze log frequency over time.""" + if not self.timestamps: + return {"error": "No timestamps found"} + + # This is a simplified version - in production you'd parse actual timestamps + total_lines = len(self.lines) + if self.timestamps: + time_span = len(self.timestamps) + avg_per_window = total_lines / max(1, 
time_span / time_window_minutes) + else: + avg_per_window = 0 + + return { + "total_lines": total_lines, + "timestamps_found": len(self.timestamps), + "avg_per_window": avg_per_window + } + + def extract_unique_messages(self, pattern: str) -> List[str]: + """Extract unique messages matching a pattern.""" + matches = [] + seen = set() + + for line in self.lines: + match = re.search(pattern, line, re.IGNORECASE) + if match: + msg = match.group() if match.lastindex is None else match.group(1) + if msg not in seen: + matches.append(msg) + seen.add(msg) + + return matches + + def find_stack_traces(self) -> List[Dict[str, Any]]: + """Extract complete stack traces.""" + stack_traces = [] + current_trace = [] + in_trace = False + + for i, line in enumerate(self.lines): + # Start of stack trace + if re.search(r'Exception|Error.*:', line): + if current_trace: + stack_traces.append({ + 'line_start': i - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + current_trace = [line.strip()] + in_trace = True + # Stack trace continuation + elif in_trace and re.search(r'^\s+at\s+', line): + current_trace.append(line.strip()) + # End of stack trace + elif in_trace: + if current_trace: + stack_traces.append({ + 'line_start': i - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + current_trace = [] + in_trace = False + + # Add last trace if exists + if current_trace: + stack_traces.append({ + 'line_start': len(self.lines) - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + + return stack_traces + + +def print_analysis_results(analyzer: LogAnalyzer, show_errors: bool = False, + show_traces: bool = False): + """Print analysis results.""" + print("\n" + "="*60) + print("📝 LOG ANALYSIS RESULTS") + print("="*60) + + print(f"\n📁 File: {analyzer.log_file}") + print(f"📊 Total Lines: {len(analyzer.lines):,}") + + # Log levels + if analyzer.log_levels: + print(f"\n{'='*60}") + print("📊 LOG LEVEL DISTRIBUTION:") + print(f"{'='*60}") + + level_emoji = { + 'FATAL': '🔴', + 'ERROR': '❌', + 'WARN': '⚠️', + 'INFO': 'ℹ️', + 'DEBUG': '🐛' + } + + for level, count in analyzer.log_levels.most_common(): + emoji = level_emoji.get(level, '•') + percentage = (count / len(analyzer.lines)) * 100 + print(f"{emoji} {level:10s}: {count:6,} ({percentage:5.1f}%)") + + # Error patterns + if analyzer.error_patterns: + print(f"\n{'='*60}") + print("🔍 ERROR PATTERNS DETECTED:") + print(f"{'='*60}") + + for pattern, count in analyzer.error_patterns.most_common(10): + print(f"• {pattern:20s}: {count:,} occurrences") + + # Timestamps + if analyzer.timestamps: + print(f"\n{'='*60}") + print(f"⏰ Timestamps Found: {len(analyzer.timestamps):,}") + print(f" First: {analyzer.timestamps[0]}") + print(f" Last: {analyzer.timestamps[-1]}") + + # Error lines + if show_errors: + errors = analyzer.find_error_lines(context=1) + if errors: + print(f"\n{'='*60}") + print(f"❌ ERROR LINES (showing first 10 of {len(errors)}):") + print(f"{'='*60}") + + for error in errors[:10]: + print(f"\nLine {error['line_number']}:") + print(f" {error['line']}") + + # Stack traces + if show_traces: + traces = analyzer.find_stack_traces() + if traces: + print(f"\n{'='*60}") + print(f"📚 STACK TRACES FOUND: {len(traces)}") + print(f"{'='*60}") + + for i, trace in enumerate(traces[:5], 1): + print(f"\nTrace {i} (starting at line {trace['line_start']}):") + print(trace['trace']) + if i < len(traces): + print("\n" + "-"*60) + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze log files for 
errors, patterns, and anomalies", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic analysis + python3 log_analyzer.py application.log + + # Show error lines with context + python3 log_analyzer.py application.log --show-errors + + # Show stack traces + python3 log_analyzer.py application.log --show-traces + + # Full analysis + python3 log_analyzer.py application.log --show-errors --show-traces + +Features: + • Log level distribution (ERROR, WARN, INFO, DEBUG, FATAL) + • Common error pattern detection + • Timestamp extraction + • Error line identification with context + • Stack trace extraction + • Frequency analysis + """ + ) + + parser.add_argument('log_file', help='Path to log file') + parser.add_argument('--show-errors', action='store_true', help='Show error lines') + parser.add_argument('--show-traces', action='store_true', help='Show stack traces') + parser.add_argument('--timestamp-pattern', help='Custom regex for timestamp extraction') + + args = parser.parse_args() + + if not Path(args.log_file).exists(): + print(f"❌ File not found: {args.log_file}") + sys.exit(1) + + print(f"🔍 Analyzing log file: {args.log_file}") + + analyzer = LogAnalyzer(args.log_file) + + if not analyzer.parse_file(): + sys.exit(1) + + # Perform analysis + analyzer.analyze_log_levels() + analyzer.analyze_error_patterns() + analyzer.extract_timestamps(args.timestamp_pattern) + + # Print results + print_analysis_results(analyzer, args.show_errors, args.show_traces) + + +if __name__ == "__main__": + main() diff --git a/scripts/slo_calculator.py b/scripts/slo_calculator.py new file mode 100644 index 0000000..78c38bb --- /dev/null +++ b/scripts/slo_calculator.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +Calculate SLO compliance, error budgets, and burn rates. +Supports availability SLOs and latency SLOs. +""" + +import argparse +import sys +from datetime import datetime, timedelta +from typing import Dict, Any, Optional + +try: + from tabulate import tabulate +except ImportError: + print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate") + tabulate = None + + +class SLOCalculator: + # SLO targets and allowed downtime per period + SLO_TARGETS = { + "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1}, # days + "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05}, + "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01}, + "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005}, + "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001}, + "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005}, + "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001}, + } + + def __init__(self, slo_target: float, period_days: int = 30): + """ + Initialize SLO calculator. + + Args: + slo_target: SLO target percentage (e.g., 99.9) + period_days: Time period in days (default: 30) + """ + self.slo_target = slo_target + self.period_days = period_days + self.error_budget_minutes = self.calculate_error_budget_minutes() + + def calculate_error_budget_minutes(self) -> float: + """Calculate error budget in minutes for the period.""" + total_minutes = self.period_days * 24 * 60 + allowed_error_rate = (100 - self.slo_target) / 100 + return total_minutes * allowed_error_rate + + def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]: + """ + Calculate availability SLO compliance. 
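+        The error budget is the number of failures the SLO target allows over the
+        period. For example, a 99.9% target over 1,000,000 requests allows 1,000
+        failures; 1,500 failures consumes 150% of the budget and violates the SLO.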
+ + Args: + total_requests: Total number of requests + failed_requests: Number of failed requests + + Returns: + Dict with SLO compliance metrics + """ + if total_requests == 0: + return { + "error": "No requests in the period", + "slo_met": False + } + + success_rate = ((total_requests - failed_requests) / total_requests) * 100 + error_rate = (failed_requests / total_requests) * 100 + + # Calculate error budget consumption + allowed_failures = total_requests * ((100 - self.slo_target) / 100) + error_budget_consumed = (failed_requests / allowed_failures) * 100 if allowed_failures > 0 else float('inf') + error_budget_remaining = max(0, 100 - error_budget_consumed) + + # Determine if SLO is met + slo_met = success_rate >= self.slo_target + + return { + "slo_target": self.slo_target, + "period_days": self.period_days, + "total_requests": total_requests, + "failed_requests": failed_requests, + "success_requests": total_requests - failed_requests, + "success_rate": success_rate, + "error_rate": error_rate, + "slo_met": slo_met, + "error_budget_total": allowed_failures, + "error_budget_consumed": error_budget_consumed, + "error_budget_remaining": error_budget_remaining, + "margin": success_rate - self.slo_target + } + + def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]: + """ + Calculate latency SLO compliance. + + Args: + total_requests: Total number of requests + requests_exceeding_threshold: Number of requests exceeding latency threshold + + Returns: + Dict with SLO compliance metrics + """ + if total_requests == 0: + return { + "error": "No requests in the period", + "slo_met": False + } + + within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100 + + # Calculate error budget consumption + allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100) + error_budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100 if allowed_slow_requests > 0 else float('inf') + error_budget_remaining = max(0, 100 - error_budget_consumed) + + slo_met = within_threshold_rate >= self.slo_target + + return { + "slo_target": self.slo_target, + "period_days": self.period_days, + "total_requests": total_requests, + "requests_exceeding_threshold": requests_exceeding_threshold, + "requests_within_threshold": total_requests - requests_exceeding_threshold, + "within_threshold_rate": within_threshold_rate, + "slo_met": slo_met, + "error_budget_total": allowed_slow_requests, + "error_budget_consumed": error_budget_consumed, + "error_budget_remaining": error_budget_remaining, + "margin": within_threshold_rate - self.slo_target + } + + def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]: + """ + Calculate error budget burn rate. 
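+        Burn rate is the observed error rate divided by the error rate the SLO
+        allows. For a 99.9% target (0.1% allowed), a window error rate of 1.44%
+        is a 14.4x burn rate, which exhausts a 30-day budget in roughly two days.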
+
+        Args:
+            errors_in_window: Number of errors in the time window
+            requests_in_window: Total requests in the time window
+            window_hours: Size of the time window in hours
+
+        Returns:
+            Dict with burn rate metrics
+        """
+        if requests_in_window == 0:
+            return {"error": "No requests in window"}
+
+        # Calculate actual error rate in this window
+        actual_error_rate = (errors_in_window / requests_in_window) * 100
+
+        # Calculate allowed error rate for SLO
+        allowed_error_rate = 100 - self.slo_target
+
+        # Burn rate = actual error rate / allowed error rate
+        burn_rate = actual_error_rate / allowed_error_rate if allowed_error_rate > 0 else float('inf')
+
+        # Time to exhaustion: a burn rate of 1.0 exhausts the budget exactly at
+        # the end of the SLO period, so exhaustion time = period length / burn rate
+        if burn_rate > 0:
+            period_hours = self.period_days * 24
+            hours_to_exhaustion = period_hours / burn_rate
+        else:
+            hours_to_exhaustion = float('inf')
+
+        # Determine severity (standard SRE burn-rate alerting thresholds)
+        if burn_rate >= 14.4:  # fast burn (pair with ~1h window): exhausts a 30-day budget in ~2 days
+            severity = "critical"
+        elif burn_rate >= 6:  # slow burn (pair with ~6h window): exhausts a 30-day budget in ~5 days
+            severity = "warning"
+        elif burn_rate >= 1:
+            severity = "elevated"
+        else:
+            severity = "normal"
+
+        return {
+            "window_hours": window_hours,
+            "requests_in_window": requests_in_window,
+            "errors_in_window": errors_in_window,
+            "actual_error_rate": actual_error_rate,
+            "allowed_error_rate": allowed_error_rate,
+            "burn_rate": burn_rate,
+            "hours_to_exhaustion": hours_to_exhaustion,
+            "severity": severity
+        }
+
+    @staticmethod
+    def print_slo_table():
+        """Print table of common SLO targets and allowed downtime."""
+        if not tabulate:
+            print("Install tabulate for formatted output: pip install tabulate")
+            return
+
+        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
+        print("="*60)
+
+        headers = ["SLO", "Year", "Month", "Week", "Day"]
+        rows = []
+
+        def fmt(days: float) -> str:
+            """Render a downtime allowance in the most readable unit."""
+            if days >= 1:
+                return f"{days:.2f} days"
+            if days * 24 >= 1:
+                return f"{days * 24:.1f} hours"
+            return f"{days * 24 * 60:.1f} min"
+
+        for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True):
+            row = [
+                f"{slo}%",
+                fmt(downtimes['year']),
+                fmt(downtimes['month']),
+                fmt(downtimes['week']),
+                fmt(downtimes['day'])
+            ]
+            rows.append(row)
+
+        print(tabulate(rows, headers=headers, tablefmt="grid"))
+
+
+def print_availability_results(results: Dict[str, Any]):
+    """Print availability or latency SLO results (both use the same layout)."""
+    slo_kind = "LATENCY" if 'within_threshold_rate' in results else "AVAILABILITY"
+    print("\n" + "="*60)
+    print(f"📊 {slo_kind} SLO COMPLIANCE")
+    print("="*60)
+
+    if "error" in results:
+        print(f"\n❌ Error: {results['error']}")
+        return
+
+    # Latency results use different key names; map them so one printer serves both modes
+    success_rate = results.get('success_rate', results.get('within_threshold_rate', 0.0))
+    good_requests = results.get('success_requests', results.get('requests_within_threshold', 0))
+    bad_requests = results.get('failed_requests', results.get('requests_exceeding_threshold', 0))
+    error_rate = results.get('error_rate', 100.0 - success_rate)
+
+    status_emoji = "✅" if results['slo_met'] else "❌"
+    print(f"\n{status_emoji} SLO Status: {'MET' if results['slo_met'] else 'VIOLATED'}")
+    print(f"   Target: {results['slo_target']}%")
+    print(f"   Actual: {success_rate:.3f}%")
+    print(f"   Margin: {results['margin']:+.3f}%")
+
+    print(f"\n📈 Request Statistics:")
+    print(f"   Total Requests: {results['total_requests']:,}")
+    print(f"   Successful: {good_requests:,}")
+    print(f"   Failed: {bad_requests:,}")
+    print(f"   Error Rate: {error_rate:.3f}%")
+
+    print(f"\n💰 Error Budget:")
+    budget_emoji = "✅" if results['error_budget_remaining'] > 20 else "⚠️" if results['error_budget_remaining'] > 0 else "❌"
+    print(f"   {budget_emoji} Remaining: {results['error_budget_remaining']:.1f}%")
+    print(f"   Consumed: {results['error_budget_consumed']:.1f}%")
+    print(f"   Allowed Failures: {results['error_budget_total']:.0f}")
+
+    print("\n" + "="*60)
+
+
+def print_burn_rate_results(results: Dict[str, Any]):
+    """Print burn rate results."""
+    print("\n" + "="*60)
+    print("🔥 ERROR BUDGET BURN RATE")
+    print("="*60)
+
+    if "error" in results:
+        print(f"\n❌ Error: {results['error']}")
+        
return + + severity_emoji = { + "critical": "🔴", + "warning": "🟡", + "elevated": "🟠", + "normal": "🟢" + } + + print(f"\n{severity_emoji.get(results['severity'], '❓')} Severity: {results['severity'].upper()}") + print(f" Burn Rate: {results['burn_rate']:.2f}x") + print(f" Time to Exhaustion: {results['hours_to_exhaustion']:.1f} hours ({results['hours_to_exhaustion']/24:.1f} days)") + + print(f"\n📊 Window Statistics:") + print(f" Window: {results['window_hours']} hours") + print(f" Requests: {results['requests_in_window']:,}") + print(f" Errors: {results['errors_in_window']:,}") + print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%") + print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%") + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Calculate SLO compliance and error budgets", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Show SLO reference table + python3 slo_calculator.py --table + + # Calculate availability SLO + python3 slo_calculator.py availability \\ + --slo 99.9 \\ + --total-requests 1000000 \\ + --failed-requests 1500 \\ + --period-days 30 + + # Calculate latency SLO + python3 slo_calculator.py latency \\ + --slo 99.5 \\ + --total-requests 500000 \\ + --slow-requests 3000 \\ + --period-days 7 + + # Calculate burn rate + python3 slo_calculator.py burn-rate \\ + --slo 99.9 \\ + --errors 50 \\ + --requests 10000 \\ + --window-hours 1 + """ + ) + + parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'], + help='Calculation mode') + parser.add_argument('--table', action='store_true', help='Show SLO reference table') + parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)') + parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)') + + # Availability SLO arguments + parser.add_argument('--total-requests', type=int, help='Total number of requests') + parser.add_argument('--failed-requests', type=int, help='Number of failed requests') + + # Latency SLO arguments + parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold') + + # Burn rate arguments + parser.add_argument('--errors', type=int, help='Number of errors in window') + parser.add_argument('--requests', type=int, help='Number of requests in window') + parser.add_argument('--window-hours', type=float, help='Window size in hours') + + args = parser.parse_args() + + # Show table if requested + if args.table: + SLOCalculator.print_slo_table() + return + + if not args.mode: + parser.print_help() + return + + if not args.slo: + print("❌ --slo required") + sys.exit(1) + + calculator = SLOCalculator(args.slo, args.period_days) + + if args.mode == 'availability': + if not args.total_requests or args.failed_requests is None: + print("❌ --total-requests and --failed-requests required") + sys.exit(1) + + results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests) + print_availability_results(results) + + elif args.mode == 'latency': + if not args.total_requests or args.slow_requests is None: + print("❌ --total-requests and --slow-requests required") + sys.exit(1) + + results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests) + print_availability_results(results) # Same format + + elif args.mode == 'burn-rate': + if not all([args.errors is not None, args.requests, args.window_hours]): + print("❌ --errors, --requests, and --window-hours required") 
+ sys.exit(1) + + results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours) + print_burn_rate_results(results) + + +if __name__ == "__main__": + main()