From 23753b435ebb546ce8ff29f24321588dec004e0e Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 17:51:22 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + SKILL.md | 869 ++++++++++++++++++ .../otel-config/collector-config.yaml | 227 +++++ .../prometheus-alerts/kubernetes-alerts.yml | 293 ++++++ .../prometheus-alerts/webapp-alerts.yml | 243 +++++ .../runbooks/incident-runbook-template.md | 409 +++++++++ monitoring-observability.skill | Bin 0 -> 102073 bytes plugin.lock.json | 125 +++ references/alerting_best_practices.md | 609 ++++++++++++ references/datadog_migration.md | 649 +++++++++++++ references/dql_promql_translation.md | 756 +++++++++++++++ references/logging_guide.md | 775 ++++++++++++++++ references/metrics_design.md | 406 ++++++++ references/slo_sla_guide.md | 652 +++++++++++++ references/tool_comparison.md | 697 ++++++++++++++ references/tracing_guide.md | 663 +++++++++++++ scripts/alert_quality_checker.py | 315 +++++++ scripts/analyze_metrics.py | 279 ++++++ scripts/dashboard_generator.py | 395 ++++++++ scripts/datadog_cost_analyzer.py | 477 ++++++++++ scripts/health_check_validator.py | 297 ++++++ scripts/log_analyzer.py | 321 +++++++ scripts/slo_calculator.py | 365 ++++++++ 24 files changed, 9837 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 assets/templates/otel-config/collector-config.yaml create mode 100644 assets/templates/prometheus-alerts/kubernetes-alerts.yml create mode 100644 assets/templates/prometheus-alerts/webapp-alerts.yml create mode 100644 assets/templates/runbooks/incident-runbook-template.md create mode 100644 monitoring-observability.skill create mode 100644 plugin.lock.json create mode 100644 references/alerting_best_practices.md create mode 100644 references/datadog_migration.md create mode 100644 references/dql_promql_translation.md create mode 100644 references/logging_guide.md create mode 100644 references/metrics_design.md create mode 100644 references/slo_sla_guide.md create mode 100644 references/tool_comparison.md create mode 100644 references/tracing_guide.md create mode 100644 scripts/alert_quality_checker.py create mode 100644 scripts/analyze_metrics.py create mode 100644 scripts/dashboard_generator.py create mode 100644 scripts/datadog_cost_analyzer.py create mode 100644 scripts/health_check_validator.py create mode 100644 scripts/log_analyzer.py create mode 100644 scripts/slo_calculator.py diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..6cdaf33 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "monitoring-observability", + "description": "Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Ahmad Asmar", + "email": "zhongweili@tubi.tv" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d417535 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# monitoring-observability + +Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..e1594d1 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,869 @@ +--- +name: monitoring-observability 
+description: Monitoring and observability strategy, implementation, and troubleshooting. Use for designing metrics/logs/traces systems, setting up Prometheus/Grafana/Loki, creating alerts and dashboards, calculating SLOs and error budgets, analyzing performance issues, and comparing monitoring tools (Datadog, ELK, CloudWatch). Covers the Four Golden Signals, RED/USE methods, OpenTelemetry instrumentation, log aggregation patterns, and distributed tracing. +--- + +# Monitoring & Observability + +## Overview + +This skill provides comprehensive guidance for monitoring and observability workflows including metrics design, log aggregation, distributed tracing, alerting strategies, SLO/SLA management, and tool selection. + +**When to use this skill**: +- Setting up monitoring for new services +- Designing alerts and dashboards +- Troubleshooting performance issues +- Implementing SLO tracking and error budgets +- Choosing between monitoring tools +- Integrating OpenTelemetry instrumentation +- Analyzing metrics, logs, and traces +- Optimizing Datadog costs and finding waste +- Migrating from Datadog to open-source stack + +--- + +## Core Workflow: Observability Implementation + +Use this decision tree to determine your starting point: + +``` +Are you setting up monitoring from scratch? +├─ YES → Start with "1. Design Metrics Strategy" +└─ NO → Do you have an existing issue? + ├─ YES → Go to "9. Troubleshooting & Analysis" + └─ NO → Are you improving existing monitoring? + ├─ Alerts → Go to "3. Alert Design" + ├─ Dashboards → Go to "4. Dashboard & Visualization" + ├─ SLOs → Go to "5. SLO & Error Budgets" + ├─ Tool selection → Read references/tool_comparison.md + └─ Using Datadog? High costs? → Go to "7. Datadog Cost Optimization & Migration" +``` + +--- + +## 1. Design Metrics Strategy + +### Start with The Four Golden Signals + +Every service should monitor: + +1. **Latency**: Response time (p50, p95, p99) +2. **Traffic**: Requests per second +3. **Errors**: Failure rate +4. 
**Saturation**: Resource utilization + +**For request-driven services**, use the **RED Method**: +- **R**ate: Requests/sec +- **E**rrors: Error rate +- **D**uration: Response time + +**For infrastructure resources**, use the **USE Method**: +- **U**tilization: % time busy +- **S**aturation**: Queue depth +- **E**rrors**: Error count + +**Quick Start - Web Application Example**: +```promql +# Rate (requests/sec) +sum(rate(http_requests_total[5m])) + +# Errors (error rate %) +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# Duration (p95 latency) +histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) +) +``` + +### Deep Dive: Metric Design + +For comprehensive metric design guidance including: +- Metric types (counter, gauge, histogram, summary) +- Cardinality best practices +- Naming conventions +- Dashboard design principles + +**→ Read**: [references/metrics_design.md](references/metrics_design.md) + +### Automated Metric Analysis + +Detect anomalies and trends in your metrics: + +```bash +# Analyze Prometheus metrics for anomalies +python3 scripts/analyze_metrics.py prometheus \ + --endpoint http://localhost:9090 \ + --query 'rate(http_requests_total[5m])' \ + --hours 24 + +# Analyze CloudWatch metrics +python3 scripts/analyze_metrics.py cloudwatch \ + --namespace AWS/EC2 \ + --metric CPUUtilization \ + --dimensions InstanceId=i-1234567890abcdef0 \ + --hours 48 +``` + +**→ Script**: [scripts/analyze_metrics.py](scripts/analyze_metrics.py) + +--- + +## 2. Log Aggregation & Analysis + +### Structured Logging Checklist + +Every log entry should include: +- ✅ Timestamp (ISO 8601 format) +- ✅ Log level (DEBUG, INFO, WARN, ERROR, FATAL) +- ✅ Message (human-readable) +- ✅ Service name +- ✅ Request ID (for tracing) + +**Example structured log (JSON)**: +```json +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "error", + "message": "Payment processing failed", + "service": "payment-service", + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "user_id": "user123", + "order_id": "ORD-456", + "error_type": "GatewayTimeout", + "duration_ms": 5000 +} +``` + +### Log Aggregation Patterns + +**ELK Stack** (Elasticsearch, Logstash, Kibana): +- Best for: Deep log analysis, complex queries +- Cost: High (infrastructure + operations) +- Complexity: High + +**Grafana Loki**: +- Best for: Cost-effective logging, Kubernetes +- Cost: Low +- Complexity: Medium + +**CloudWatch Logs**: +- Best for: AWS-centric applications +- Cost: Medium +- Complexity: Low + +### Log Analysis + +Analyze logs for errors, patterns, and anomalies: + +```bash +# Analyze log file for patterns +python3 scripts/log_analyzer.py application.log + +# Show error lines with context +python3 scripts/log_analyzer.py application.log --show-errors + +# Extract stack traces +python3 scripts/log_analyzer.py application.log --show-traces +``` + +**→ Script**: [scripts/log_analyzer.py](scripts/log_analyzer.py) + +### Deep Dive: Logging + +For comprehensive logging guidance including: +- Structured logging implementation examples (Python, Node.js, Go, Java) +- Log aggregation patterns (ELK, Loki, CloudWatch, Fluentd) +- Query patterns and best practices +- PII redaction and security +- Sampling and rate limiting + +**→ Read**: [references/logging_guide.md](references/logging_guide.md) + +--- + +## 3. Alert Design + +### Alert Design Principles + +1. **Every alert must be actionable** - If you can't do something, don't alert +2. 
**Alert on symptoms, not causes** - Alert on user experience, not components +3. **Tie alerts to SLOs** - Connect to business impact +4. **Reduce noise** - Only page for critical issues + +### Alert Severity Levels + +| Severity | Response Time | Example | +|----------|--------------|---------| +| **Critical** | Page immediately | Service down, SLO violation | +| **Warning** | Ticket, review in hours | Elevated error rate, resource warning | +| **Info** | Log for awareness | Deployment completed, scaling event | + +### Multi-Window Burn Rate Alerting + +Alert when error budget is consumed too quickly: + +```yaml +# Fast burn (1h window) - Critical +- alert: ErrorBudgetFastBurn + expr: | + (error_rate / 0.001) > 14.4 # 99.9% SLO + for: 2m + labels: + severity: critical + +# Slow burn (6h window) - Warning +- alert: ErrorBudgetSlowBurn + expr: | + (error_rate / 0.001) > 6 # 99.9% SLO + for: 30m + labels: + severity: warning +``` + +### Alert Quality Checker + +Audit your alert rules against best practices: + +```bash +# Check single file +python3 scripts/alert_quality_checker.py alerts.yml + +# Check all rules in directory +python3 scripts/alert_quality_checker.py /path/to/prometheus/rules/ +``` + +**Checks for**: +- Alert naming conventions +- Required labels (severity, team) +- Required annotations (summary, description, runbook_url) +- PromQL expression quality +- 'for' clause to prevent flapping + +**→ Script**: [scripts/alert_quality_checker.py](scripts/alert_quality_checker.py) + +### Alert Templates + +Production-ready alert rule templates: + +**→ Templates**: +- [assets/templates/prometheus-alerts/webapp-alerts.yml](assets/templates/prometheus-alerts/webapp-alerts.yml) - Web application alerts +- [assets/templates/prometheus-alerts/kubernetes-alerts.yml](assets/templates/prometheus-alerts/kubernetes-alerts.yml) - Kubernetes alerts + +### Deep Dive: Alerting + +For comprehensive alerting guidance including: +- Alert design patterns (multi-window, rate of change, threshold with hysteresis) +- Alert annotation best practices +- Alert routing (severity-based, team-based, time-based) +- Inhibition rules +- Runbook structure +- On-call best practices + +**→ Read**: [references/alerting_best_practices.md](references/alerting_best_practices.md) + +### Runbook Template + +Create comprehensive runbooks for your alerts: + +**→ Template**: [assets/templates/runbooks/incident-runbook-template.md](assets/templates/runbooks/incident-runbook-template.md) + +--- + +## 4. Dashboard & Visualization + +### Dashboard Design Principles + +1. **Top-down layout**: Most important metrics first +2. **Color coding**: Red (critical), yellow (warning), green (healthy) +3. **Consistent time windows**: All panels use same time range +4. **Limit panels**: 8-12 panels per dashboard maximum +5. 
**Include context**: Show related metrics together + +### Recommended Dashboard Structure + +``` +┌─────────────────────────────────────┐ +│ Overall Health (Single Stats) │ +│ [Requests/s] [Error%] [P95 Latency]│ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Request Rate & Errors (Graphs) │ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Latency Distribution (Graphs) │ +└─────────────────────────────────────┘ +┌─────────────────────────────────────┐ +│ Resource Usage (Graphs) │ +└─────────────────────────────────────┘ +``` + +### Generate Grafana Dashboards + +Automatically generate dashboards from templates: + +```bash +# Web application dashboard +python3 scripts/dashboard_generator.py webapp \ + --title "My API Dashboard" \ + --service my_api \ + --output dashboard.json + +# Kubernetes dashboard +python3 scripts/dashboard_generator.py kubernetes \ + --title "K8s Production" \ + --namespace production \ + --output k8s-dashboard.json + +# Database dashboard +python3 scripts/dashboard_generator.py database \ + --title "PostgreSQL" \ + --db-type postgres \ + --instance db.example.com:5432 \ + --output db-dashboard.json +``` + +**Supports**: +- Web applications (requests, errors, latency, resources) +- Kubernetes (pods, nodes, resources, network) +- Databases (PostgreSQL, MySQL) + +**→ Script**: [scripts/dashboard_generator.py](scripts/dashboard_generator.py) + +--- + +## 5. SLO & Error Budgets + +### SLO Fundamentals + +**SLI** (Service Level Indicator): Measurement of service quality +- Example: Request latency, error rate, availability + +**SLO** (Service Level Objective): Target value for an SLI +- Example: "99.9% of requests return in < 500ms" + +**Error Budget**: Allowed failure amount = (100% - SLO) +- Example: 99.9% SLO = 0.1% error budget = 43.2 minutes/month + +### Common SLO Targets + +| Availability | Downtime/Month | Use Case | +|--------------|----------------|----------| +| **99%** | 7.2 hours | Internal tools | +| **99.9%** | 43.2 minutes | Standard production | +| **99.95%** | 21.6 minutes | Critical services | +| **99.99%** | 4.3 minutes | High availability | + +### SLO Calculator + +Calculate compliance, error budgets, and burn rates: + +```bash +# Show SLO reference table +python3 scripts/slo_calculator.py --table + +# Calculate availability SLO +python3 scripts/slo_calculator.py availability \ + --slo 99.9 \ + --total-requests 1000000 \ + --failed-requests 1500 \ + --period-days 30 + +# Calculate burn rate +python3 scripts/slo_calculator.py burn-rate \ + --slo 99.9 \ + --errors 50 \ + --requests 10000 \ + --window-hours 1 +``` + +**→ Script**: [scripts/slo_calculator.py](scripts/slo_calculator.py) + +### Deep Dive: SLO/SLA + +For comprehensive SLO/SLA guidance including: +- Choosing appropriate SLIs +- Setting realistic SLO targets +- Error budget policies +- Burn rate alerting +- SLA structure and contracts +- Monthly reporting templates + +**→ Read**: [references/slo_sla_guide.md](references/slo_sla_guide.md) + +--- + +## 6. 
Distributed Tracing + +### When to Use Tracing + +Use distributed tracing when you need to: +- Debug performance issues across services +- Understand request flow through microservices +- Identify bottlenecks in distributed systems +- Find N+1 query problems + +### OpenTelemetry Implementation + +**Python example**: +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +@tracer.start_as_current_span("process_order") +def process_order(order_id): + span = trace.get_current_span() + span.set_attribute("order.id", order_id) + + try: + result = payment_service.charge(order_id) + span.set_attribute("payment.status", "success") + return result + except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +### Sampling Strategies + +- **Development**: 100% (ALWAYS_ON) +- **Staging**: 50-100% +- **Production**: 1-10% (or error-based sampling) + +**Error-based sampling** (always sample errors, 1% of successes): +```python +class ErrorSampler(Sampler): + def should_sample(self, parent_context, trace_id, name, **kwargs): + attributes = kwargs.get('attributes', {}) + + if attributes.get('error', False): + return Decision.RECORD_AND_SAMPLE + + if trace_id & 0xFF < 3: # ~1% + return Decision.RECORD_AND_SAMPLE + + return Decision.DROP +``` + +### OTel Collector Configuration + +Production-ready OpenTelemetry Collector configuration: + +**→ Template**: [assets/templates/otel-config/collector-config.yaml](assets/templates/otel-config/collector-config.yaml) + +**Features**: +- Receives OTLP, Prometheus, and host metrics +- Batching and memory limiting +- Tail sampling (error-based, latency-based, probabilistic) +- Multiple exporters (Tempo, Jaeger, Loki, Prometheus, CloudWatch, Datadog) + +### Deep Dive: Tracing + +For comprehensive tracing guidance including: +- OpenTelemetry instrumentation (Python, Node.js, Go, Java) +- Span attributes and semantic conventions +- Context propagation (W3C Trace Context) +- Backend comparison (Jaeger, Tempo, X-Ray, Datadog APM) +- Analysis patterns (finding slow traces, N+1 queries) +- Integration with logs + +**→ Read**: [references/tracing_guide.md](references/tracing_guide.md) + +--- + +## 7. Datadog Cost Optimization & Migration + +### Scenario 1: I'm Using Datadog and Costs Are Too High + +If your Datadog bill is growing out of control, start by identifying waste: + +#### Cost Analysis Script + +Automatically analyze your Datadog usage and find cost optimization opportunities: + +```bash +# Analyze Datadog usage (requires API key and APP key) +python3 scripts/datadog_cost_analyzer.py \ + --api-key $DD_API_KEY \ + --app-key $DD_APP_KEY + +# Show detailed breakdown by category +python3 scripts/datadog_cost_analyzer.py \ + --api-key $DD_API_KEY \ + --app-key $DD_APP_KEY \ + --show-details +``` + +**What it checks**: +- Infrastructure host count and cost +- Custom metrics usage and high-cardinality metrics +- Log ingestion volume and trends +- APM host usage +- Unused or noisy monitors +- Container vs VM optimization opportunities + +**→ Script**: [scripts/datadog_cost_analyzer.py](scripts/datadog_cost_analyzer.py) + +#### Common Cost Optimization Strategies + +**1. Custom Metrics Optimization** (typical savings: 20-40%): +- Remove high-cardinality tags (user IDs, request IDs) +- Delete unused custom metrics +- Aggregate metrics before sending +- Use metric prefixes to identify teams/services + +**2. 
Log Management** (typical savings: 30-50%): +- Implement log sampling for high-volume services +- Use exclusion filters for debug/trace logs in production +- Archive cold logs to S3/GCS after 7 days +- Set log retention policies (15 days instead of 30) + +**3. APM Optimization** (typical savings: 15-25%): +- Reduce trace sampling rates (10% → 5% in prod) +- Use head-based sampling instead of complete sampling +- Remove APM from non-critical services +- Use trace search with lower retention + +**4. Infrastructure Monitoring** (typical savings: 10-20%): +- Switch from VM-based to container-based pricing where possible +- Remove agents from ephemeral instances +- Use Datadog's host reduction strategies +- Consolidate staging environments + +### Scenario 2: Migrating Away from Datadog + +If you're considering migrating to a more cost-effective open-source stack: + +#### Migration Overview + +**From Datadog** → **To Open Source Stack**: +- Metrics: Datadog → **Prometheus + Grafana** +- Logs: Datadog Logs → **Grafana Loki** +- Traces: Datadog APM → **Tempo or Jaeger** +- Dashboards: Datadog → **Grafana** +- Alerts: Datadog Monitors → **Prometheus Alertmanager** + +**Estimated Cost Savings**: 60-77% ($49.8k-61.8k/year for 100-host environment) + +#### Migration Strategy + +**Phase 1: Run Parallel** (Month 1-2): +- Deploy open-source stack alongside Datadog +- Migrate metrics first (lowest risk) +- Validate data accuracy + +**Phase 2: Migrate Dashboards & Alerts** (Month 2-3): +- Convert Datadog dashboards to Grafana +- Translate alert rules (use DQL → PromQL guide below) +- Train team on new tools + +**Phase 3: Migrate Logs & Traces** (Month 3-4): +- Set up Loki for log aggregation +- Deploy Tempo/Jaeger for tracing +- Update application instrumentation + +**Phase 4: Decommission Datadog** (Month 4-5): +- Confirm all functionality migrated +- Cancel Datadog subscription + +#### Query Translation: DQL → PromQL + +When migrating dashboards and alerts, you'll need to translate Datadog queries to PromQL: + +**Quick examples**: +``` +# Average CPU +Datadog: avg:system.cpu.user{*} +Prometheus: avg(node_cpu_seconds_total{mode="user"}) + +# Request rate +Datadog: sum:requests.count{*}.as_rate() +Prometheus: sum(rate(http_requests_total[5m])) + +# P95 latency +Datadog: p95:request.duration{*} +Prometheus: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Error rate percentage +Datadog: (sum:requests.errors{*}.as_rate() / sum:requests.count{*}.as_rate()) * 100 +Prometheus: (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 +``` + +**→ Full Translation Guide**: [references/dql_promql_translation.md](references/dql_promql_translation.md) + +#### Cost Comparison + +**Example: 100-host infrastructure** + +| Component | Datadog (Annual) | Open Source (Annual) | Savings | +|-----------|-----------------|---------------------|---------| +| Infrastructure | $18,000 | $10,000 (self-hosted infra) | $8,000 | +| Custom Metrics | $600 | Included | $600 | +| Logs | $24,000 | $3,000 (storage) | $21,000 | +| APM/Traces | $37,200 | $5,000 (storage) | $32,200 | +| **Total** | **$79,800** | **$18,000** | **$61,800 (77%)** | + +### Deep Dive: Datadog Migration + +For comprehensive migration guidance including: +- Detailed cost comparison and ROI calculations +- Step-by-step migration instructions +- Infrastructure sizing recommendations (CPU, RAM, storage) +- Dashboard conversion tools and examples +- Alert rule translation patterns +- 
Application instrumentation changes (DogStatsD → Prometheus client) +- Python scripts for exporting Datadog dashboards and monitors +- Common challenges and solutions + +**→ Read**: [references/datadog_migration.md](references/datadog_migration.md) + +--- + +## 8. Tool Selection & Comparison + +### Decision Matrix + +**Choose Prometheus + Grafana if**: +- ✅ Using Kubernetes +- ✅ Want control and customization +- ✅ Have ops capacity +- ✅ Budget-conscious + +**Choose Datadog if**: +- ✅ Want ease of use +- ✅ Need full observability now +- ✅ Budget allows ($8k+/month for 100 hosts) + +**Choose Grafana Stack (LGTM) if**: +- ✅ Want open source full stack +- ✅ Cost-effective solution +- ✅ Cloud-native architecture + +**Choose ELK Stack if**: +- ✅ Heavy log analysis needs +- ✅ Need powerful search +- ✅ Have dedicated ops team + +**Choose Cloud Native (CloudWatch/etc) if**: +- ✅ Single cloud provider +- ✅ Simple needs +- ✅ Want minimal setup + +### Cost Comparison (100 hosts, 1TB logs/month) + +| Solution | Monthly Cost | Setup | Ops Burden | +|----------|-------------|--------|------------| +| Prometheus + Loki + Tempo | $1,500 | Medium | Medium | +| Grafana Cloud | $3,000 | Low | Low | +| Datadog | $8,000 | Low | None | +| ELK Stack | $4,000 | High | High | +| CloudWatch | $2,000 | Low | Low | + +### Deep Dive: Tool Comparison + +For comprehensive tool comparison including: +- Metrics platforms (Prometheus, Datadog, New Relic, CloudWatch, Grafana Cloud) +- Logging platforms (ELK, Loki, Splunk, CloudWatch Logs, Sumo Logic) +- Tracing platforms (Jaeger, Tempo, Datadog APM, X-Ray) +- Full-stack observability comparison +- Recommendations by company size + +**→ Read**: [references/tool_comparison.md](references/tool_comparison.md) + +--- + +## 9. Troubleshooting & Analysis + +### Health Check Validation + +Validate health check endpoints against best practices: + +```bash +# Check single endpoint +python3 scripts/health_check_validator.py https://api.example.com/health + +# Check multiple endpoints +python3 scripts/health_check_validator.py \ + https://api.example.com/health \ + https://api.example.com/readiness \ + --verbose +``` + +**Checks for**: +- ✓ Returns 200 status code +- ✓ Response time < 1 second +- ✓ Returns JSON format +- ✓ Contains 'status' field +- ✓ Includes version/build info +- ✓ Checks dependencies +- ✓ Disables caching + +**→ Script**: [scripts/health_check_validator.py](scripts/health_check_validator.py) + +### Common Troubleshooting Workflows + +**High Latency Investigation**: +1. Check dashboards for latency spike +2. Query traces for slow operations +3. Check database slow query log +4. Check external API response times +5. Review recent deployments +6. Check resource utilization (CPU, memory) + +**High Error Rate Investigation**: +1. Check error logs for patterns +2. Identify affected endpoints +3. Check dependency health +4. Review recent deployments +5. Check resource limits +6. Verify configuration + +**Service Down Investigation**: +1. Check if pods/instances are running +2. Check health check endpoint +3. Review recent deployments +4. Check resource availability +5. Check network connectivity +6. 
Review logs for startup errors + +--- + +## Quick Reference Commands + +### Prometheus Queries + +```promql +# Request rate +sum(rate(http_requests_total[5m])) + +# Error rate +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# P95 latency +histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) +) + +# CPU usage +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Memory usage +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 +``` + +### Kubernetes Commands + +```bash +# Check pod status +kubectl get pods -n + +# View pod logs +kubectl logs -f -n + +# Check pod resources +kubectl top pods -n + +# Describe pod for events +kubectl describe pod -n + +# Check recent deployments +kubectl rollout history deployment/ -n +``` + +### Log Queries + +**Elasticsearch**: +```json +GET /logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "level": "error" } }, + { "range": { "@timestamp": { "gte": "now-1h" } } } + ] + } + } +} +``` + +**Loki (LogQL)**: +```logql +{job="app", level="error"} |= "error" | json +``` + +**CloudWatch Insights**: +``` +fields @timestamp, level, message +| filter level = "error" +| filter @timestamp > ago(1h) +``` + +--- + +## Resources Summary + +### Scripts (automation and analysis) +- `analyze_metrics.py` - Detect anomalies in Prometheus/CloudWatch metrics +- `alert_quality_checker.py` - Audit alert rules against best practices +- `slo_calculator.py` - Calculate SLO compliance and error budgets +- `log_analyzer.py` - Parse logs for errors and patterns +- `dashboard_generator.py` - Generate Grafana dashboards from templates +- `health_check_validator.py` - Validate health check endpoints +- `datadog_cost_analyzer.py` - Analyze Datadog usage and find cost waste + +### References (deep-dive documentation) +- `metrics_design.md` - Four Golden Signals, RED/USE methods, metric types +- `alerting_best_practices.md` - Alert design, runbooks, on-call practices +- `logging_guide.md` - Structured logging, aggregation patterns +- `tracing_guide.md` - OpenTelemetry, distributed tracing +- `slo_sla_guide.md` - SLI/SLO/SLA definitions, error budgets +- `tool_comparison.md` - Comprehensive comparison of monitoring tools +- `datadog_migration.md` - Complete guide for migrating from Datadog to OSS stack +- `dql_promql_translation.md` - Datadog Query Language to PromQL translation reference + +### Templates (ready-to-use configurations) +- `prometheus-alerts/webapp-alerts.yml` - Production-ready web app alerts +- `prometheus-alerts/kubernetes-alerts.yml` - Kubernetes monitoring alerts +- `otel-config/collector-config.yaml` - OpenTelemetry Collector configuration +- `runbooks/incident-runbook-template.md` - Incident response template + +--- + +## Best Practices + +### Metrics +- Start with Four Golden Signals +- Use appropriate metric types (counter, gauge, histogram) +- Keep cardinality low (avoid high-cardinality labels) +- Follow naming conventions + +### Logging +- Use structured logging (JSON) +- Include request IDs for tracing +- Set appropriate log levels +- Redact PII before logging + +### Alerting +- Make every alert actionable +- Alert on symptoms, not causes +- Use multi-window burn rate alerts +- Include runbook links + +### Tracing +- Sample appropriately (1-10% in production) +- Always record errors +- Use semantic conventions +- Propagate context between services + +### SLOs +- Start with current performance +- Set realistic targets +- 
Define error budget policies +- Review and adjust quarterly diff --git a/assets/templates/otel-config/collector-config.yaml b/assets/templates/otel-config/collector-config.yaml new file mode 100644 index 0000000..2a94676 --- /dev/null +++ b/assets/templates/otel-config/collector-config.yaml @@ -0,0 +1,227 @@ +# OpenTelemetry Collector Configuration +# Receives metrics, logs, and traces and exports to various backends + +receivers: + # OTLP receiver (standard OpenTelemetry protocol) + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + # Prometheus receiver (scrape Prometheus endpoints) + prometheus: + config: + scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 30s + static_configs: + - targets: ['localhost:8888'] + + # Host metrics (CPU, memory, disk, network) + hostmetrics: + collection_interval: 30s + scrapers: + cpu: + memory: + disk: + network: + filesystem: + load: + + # Kubernetes receiver (cluster metrics) + k8s_cluster: + auth_type: serviceAccount + node_conditions_to_report: [Ready, MemoryPressure, DiskPressure] + distribution: kubernetes + + # Zipkin receiver (legacy tracing) + zipkin: + endpoint: 0.0.0.0:9411 + +processors: + # Batch processor (improves performance) + batch: + timeout: 10s + send_batch_size: 1024 + send_batch_max_size: 2048 + + # Memory limiter (prevent OOM) + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + # Resource processor (add resource attributes) + resource: + attributes: + - key: environment + value: production + action: insert + - key: cluster.name + value: prod-cluster + action: insert + + # Attributes processor (modify span/metric attributes) + attributes: + actions: + - key: http.url + action: delete # Remove potentially sensitive URLs + - key: db.statement + action: hash # Hash SQL queries for privacy + + # Filter processor (drop unwanted data) + filter: + metrics: + # Drop metrics matching criteria + exclude: + match_type: regexp + metric_names: + - ^go_.* # Drop Go runtime metrics + - ^process_.* # Drop process metrics + + # Tail sampling (intelligent trace sampling) + tail_sampling: + decision_wait: 10s + num_traces: 100 + policies: + # Always sample errors + - name: error-policy + type: status_code + status_code: + status_codes: [ERROR] + + # Sample slow traces + - name: latency-policy + type: latency + latency: + threshold_ms: 1000 + + # Sample 10% of others + - name: probabilistic-policy + type: probabilistic + probabilistic: + sampling_percentage: 10 + + # Span processor (modify spans) + span: + name: + to_attributes: + rules: + - ^\/api\/v1\/users\/(?P.*)$ + from_attributes: + - db.name + - http.method + +exporters: + # Prometheus exporter (expose metrics endpoint) + prometheus: + endpoint: 0.0.0.0:8889 + namespace: otel + + # OTLP exporters (send to backends) + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlp/mimir: + endpoint: mimir:4317 + tls: + insecure: true + + # Loki exporter (for logs) + loki: + endpoint: http://loki:3100/loki/api/v1/push + labels: + resource: + service.name: "service_name" + service.namespace: "service_namespace" + attributes: + level: "level" + + # Jaeger exporter (alternative tracing backend) + jaeger: + endpoint: jaeger:14250 + tls: + insecure: true + + # Elasticsearch exporter (for logs) + elasticsearch: + endpoints: + - http://elasticsearch:9200 + logs_index: otel-logs + traces_index: otel-traces + + # CloudWatch exporter (AWS) + awscloudwatch: + region: us-east-1 + namespace: MyApp + log_group_name: 
/aws/otel/logs + log_stream_name: otel-collector + + # Datadog exporter + datadog: + api: + key: ${DD_API_KEY} + site: datadoghq.com + + # File exporter (debugging) + file: + path: /tmp/otel-output.json + + # Logging exporter (console output for debugging) + logging: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + +extensions: + # Health check endpoint + health_check: + endpoint: 0.0.0.0:13133 + + # Pprof endpoint (for profiling) + pprof: + endpoint: 0.0.0.0:1777 + + # ZPages (internal diagnostics) + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [health_check, pprof, zpages] + + pipelines: + # Traces pipeline + traces: + receivers: [otlp, zipkin] + processors: [memory_limiter, batch, tail_sampling, resource, span] + exporters: [otlp/tempo, jaeger, logging] + + # Metrics pipeline + metrics: + receivers: [otlp, prometheus, hostmetrics, k8s_cluster] + processors: [memory_limiter, batch, filter, resource] + exporters: [otlp/mimir, prometheus, awscloudwatch] + + # Logs pipeline + logs: + receivers: [otlp] + processors: [memory_limiter, batch, resource, attributes] + exporters: [loki, elasticsearch, awscloudwatch] + + # Telemetry (collector's own metrics) + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888 + +# Notes: +# 1. Replace ${DD_API_KEY} with actual API key or use environment variable +# 2. Adjust endpoints to match your infrastructure +# 3. Comment out exporters you don't use +# 4. Adjust sampling rates based on your volume and needs +# 5. Add TLS configuration for production deployments diff --git a/assets/templates/prometheus-alerts/kubernetes-alerts.yml b/assets/templates/prometheus-alerts/kubernetes-alerts.yml new file mode 100644 index 0000000..adac0c6 --- /dev/null +++ b/assets/templates/prometheus-alerts/kubernetes-alerts.yml @@ -0,0 +1,293 @@ +--- +# Prometheus Alert Rules for Kubernetes +# Covers pods, nodes, deployments, and resource usage + +groups: + - name: kubernetes_pods + interval: 30s + rules: + # Pod crash looping + - alert: PodCrashLooping + expr: | + rate(kube_pod_container_status_restarts_total[15m]) > 0 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last 15 minutes. + + Check pod logs: + kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous + runbook_url: "https://runbooks.example.com/pod-crash-loop" + + # Pod not ready + - alert: PodNotReady + expr: | + sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} is in {{ $labels.phase }} state for 10 minutes. + + Investigate: + kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }} + runbook_url: "https://runbooks.example.com/pod-not-ready" + + # Pod OOMKilled + - alert: PodOOMKilled + expr: | + sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0 + for: 1m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory. 
+ + Increase memory limits or investigate memory leak. + runbook_url: "https://runbooks.example.com/oom-killed" + + - name: kubernetes_deployments + interval: 30s + rules: + # Deployment replica mismatch + - alert: DeploymentReplicasMismatch + expr: | + kube_deployment_spec_replicas != kube_deployment_status_replicas_available + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}" + description: | + Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with + fewer replicas than desired for 15 minutes. + + Desired: {{ $value }} + Available: Check deployment status + runbook_url: "https://runbooks.example.com/replica-mismatch" + + # Deployment rollout stuck + - alert: DeploymentRolloutStuck + expr: | + kube_deployment_status_condition{condition="Progressing", status="false"} > 0 + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}" + description: | + Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck. + + Check rollout status: + kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/rollout-stuck" + + - name: kubernetes_nodes + interval: 30s + rules: + # Node not ready + - alert: NodeNotReady + expr: | + kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 5m + labels: + severity: critical + team: platform + component: kubernetes + annotations: + summary: "Node not ready - {{ $labels.node }}" + description: | + Node {{ $labels.node }} has been NotReady for 5 minutes. + + This will affect pod scheduling and availability. + + Check node status: + kubectl describe node {{ $labels.node }} + runbook_url: "https://runbooks.example.com/node-not-ready" + + # Node memory pressure + - alert: NodeMemoryPressure + expr: | + kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node under memory pressure - {{ $labels.node }}" + description: | + Node {{ $labels.node }} is experiencing memory pressure. + + Pods may be evicted. Consider scaling up or evicting low-priority pods. + runbook_url: "https://runbooks.example.com/memory-pressure" + + # Node disk pressure + - alert: NodeDiskPressure + expr: | + kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node under disk pressure - {{ $labels.node }}" + description: | + Node {{ $labels.node }} is experiencing disk pressure. + + Clean up disk space or add capacity. + runbook_url: "https://runbooks.example.com/disk-pressure" + + # Node high CPU + - alert: NodeHighCPU + expr: | + (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80 + for: 15m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Node high CPU usage - {{ $labels.instance }}" + description: | + Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%. + + Check for resource-intensive pods or scale cluster. 
+ runbook_url: "https://runbooks.example.com/node-high-cpu" + + - name: kubernetes_resources + interval: 30s + rules: + # Container CPU throttling + - alert: ContainerCPUThrottling + expr: | + rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} + is being CPU throttled. + + CPU throttling rate: {{ $value | humanize }} + + Consider increasing CPU limits. + runbook_url: "https://runbooks.example.com/cpu-throttling" + + # Container memory usage high + - alert: ContainerMemoryUsageHigh + expr: | + (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.9 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}" + description: | + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} + is using {{ $value | humanizePercentage }} of its memory limit. + + Risk of OOMKill. Consider increasing memory limits. + runbook_url: "https://runbooks.example.com/high-memory" + + - name: kubernetes_pv + interval: 30s + rules: + # PersistentVolume nearing full + - alert: PersistentVolumeFillingUp + expr: | + (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}" + description: | + PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} + is {{ $value | humanizePercentage }} full. + + Available space is running low. Consider expanding volume. + runbook_url: "https://runbooks.example.com/pv-filling-up" + + # PersistentVolume critically full + - alert: PersistentVolumeCriticallyFull + expr: | + (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05 + for: 5m + labels: + severity: critical + team: platform + component: kubernetes + annotations: + summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}" + description: | + PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} + is {{ $value | humanizePercentage }} full. + + Immediate action required to prevent application failures. + runbook_url: "https://runbooks.example.com/pv-critically-full" + + - name: kubernetes_jobs + interval: 30s + rules: + # Job failed + - alert: JobFailed + expr: | + kube_job_status_failed > 0 + for: 5m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}" + description: | + Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed. 
+ + Check job logs: + kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/job-failed" + + # CronJob not running + - alert: CronJobNotRunning + expr: | + time() - kube_cronjob_status_last_schedule_time > 3600 + for: 10m + labels: + severity: warning + team: platform + component: kubernetes + annotations: + summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}" + description: | + CronJob {{ $labels.namespace}}/{{ $labels.cronjob }} hasn't run in over an hour. + + Check CronJob status: + kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }} + runbook_url: "https://runbooks.example.com/cronjob-not-running" diff --git a/assets/templates/prometheus-alerts/webapp-alerts.yml b/assets/templates/prometheus-alerts/webapp-alerts.yml new file mode 100644 index 0000000..f7e596e --- /dev/null +++ b/assets/templates/prometheus-alerts/webapp-alerts.yml @@ -0,0 +1,243 @@ +--- +# Prometheus Alert Rules for Web Applications +# Based on SLO best practices and multi-window burn rate alerting + +groups: + - name: webapp_availability + interval: 30s + rules: + # Fast burn rate alert (1h window) - SLO: 99.9% + - alert: ErrorBudgetFastBurn + expr: | + ( + sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h])) + / + sum(rate(http_requests_total{job="webapp"}[1h])) + ) > (14.4 * 0.001) + for: 2m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Fast error budget burn - {{ $labels.job }}" + description: | + Error rate is {{ $value | humanizePercentage }} over the last hour, + burning through error budget at 14.4x rate. + + At this rate, the monthly error budget will be exhausted in 2 days. + + Immediate investigation required. + runbook_url: "https://runbooks.example.com/error-budget-burn" + dashboard: "https://grafana.example.com/d/webapp" + + # Slow burn rate alert (6h window) + - alert: ErrorBudgetSlowBurn + expr: | + ( + sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h])) + / + sum(rate(http_requests_total{job="webapp"}[6h])) + ) > (6 * 0.001) + for: 30m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Elevated error budget burn - {{ $labels.job }}" + description: | + Error rate is {{ $value | humanizePercentage }} over the last 6 hours, + burning through error budget at 6x rate. + + Monitor closely and investigate if trend continues. + runbook_url: "https://runbooks.example.com/error-budget-burn" + + # Service down alert + - alert: WebAppDown + expr: up{job="webapp"} == 0 + for: 2m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Web application is down - {{ $labels.instance }}" + description: | + Web application instance {{ $labels.instance }} has been down for 2 minutes. + + Check service health and logs immediately. + runbook_url: "https://runbooks.example.com/service-down" + + - name: webapp_latency + interval: 30s + rules: + # High latency (p95) + - alert: HighLatencyP95 + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le) + ) > 0.5 + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High p95 latency - {{ $labels.job }}" + description: | + P95 request latency is {{ $value }}s, exceeding 500ms threshold. + + This may impact user experience. 
Check for: + - Slow database queries + - External API issues + - Resource saturation + runbook_url: "https://runbooks.example.com/high-latency" + dashboard: "https://grafana.example.com/d/webapp-latency" + + # Very high latency (p99) + - alert: HighLatencyP99 + expr: | + histogram_quantile(0.99, + sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le) + ) > 2 + for: 5m + labels: + severity: critical + team: backend + component: webapp + annotations: + summary: "Critical latency degradation - {{ $labels.job }}" + description: | + P99 request latency is {{ $value }}s, exceeding 2s threshold. + + Severe performance degradation detected. + runbook_url: "https://runbooks.example.com/high-latency" + + - name: webapp_resources + interval: 30s + rules: + # High CPU + - alert: HighCPU + expr: | + rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80 + for: 15m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High CPU usage - {{ $labels.instance }}" + description: | + CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}. + + Consider scaling up or investigating CPU-intensive operations. + runbook_url: "https://runbooks.example.com/high-cpu" + + # High memory + - alert: HighMemory + expr: | + (process_resident_memory_bytes{job="webapp"} / node_memory_MemTotal_bytes) * 100 > 80 + for: 15m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "High memory usage - {{ $labels.instance }}" + description: | + Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}. + + Check for memory leaks or consider scaling up. + runbook_url: "https://runbooks.example.com/high-memory" + + - name: webapp_traffic + interval: 30s + rules: + # Traffic spike + - alert: TrafficSpike + expr: | + sum(rate(http_requests_total{job="webapp"}[5m])) + > + 1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h)) + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Traffic spike detected - {{ $labels.job }}" + description: | + Request rate increased by 50% compared to 1 hour ago. + + Current: {{ $value | humanize }} req/s + + This could be: + - Legitimate traffic increase + - DDoS attack + - Retry storm + + Monitor closely and be ready to scale. + runbook_url: "https://runbooks.example.com/traffic-spike" + + # Traffic drop (potential issue) + - alert: TrafficDrop + expr: | + sum(rate(http_requests_total{job="webapp"}[5m])) + < + 0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h)) + for: 10m + labels: + severity: warning + team: backend + component: webapp + annotations: + summary: "Traffic drop detected - {{ $labels.job }}" + description: | + Request rate dropped by 50% compared to 1 hour ago. + + This could indicate: + - Upstream service issue + - DNS problems + - Load balancer misconfiguration + runbook_url: "https://runbooks.example.com/traffic-drop" + + - name: webapp_dependencies + interval: 30s + rules: + # Database connection pool exhaustion + - alert: DatabasePoolExhausted + expr: | + (db_connection_pool_active / db_connection_pool_max) > 0.9 + for: 5m + labels: + severity: critical + team: backend + component: database + annotations: + summary: "Database connection pool near exhaustion" + description: | + Connection pool is {{ $value | humanizePercentage }} full. + + This will cause request failures. Immediate action required. 
+ runbook_url: "https://runbooks.example.com/db-pool-exhausted" + + # External API errors + - alert: ExternalAPIErrors + expr: | + sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api) + / + sum(rate(external_api_requests_total[5m])) by (api) + > 0.1 + for: 5m + labels: + severity: warning + team: backend + component: integration + annotations: + summary: "High error rate from external API - {{ $labels.api }}" + description: | + {{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate. + + Check API status page and consider enabling circuit breaker. + runbook_url: "https://runbooks.example.com/external-api-errors" diff --git a/assets/templates/runbooks/incident-runbook-template.md b/assets/templates/runbooks/incident-runbook-template.md new file mode 100644 index 0000000..59a0103 --- /dev/null +++ b/assets/templates/runbooks/incident-runbook-template.md @@ -0,0 +1,409 @@ +# Runbook: [Alert Name] + +## Overview + +**Alert Name**: [e.g., HighLatency, ServiceDown, ErrorBudgetBurn] + +**Severity**: [Critical | Warning | Info] + +**Team**: [e.g., Backend, Platform, Database] + +**Component**: [e.g., API Gateway, User Service, PostgreSQL] + +**What it means**: [One-line description of what this alert indicates] + +**User impact**: [How does this affect users? High/Medium/Low] + +**Urgency**: [How quickly must this be addressed? Immediate/Hours/Days] + +--- + +## Alert Details + +### When This Alert Fires + +This alert fires when: +- [Specific condition, e.g., "P95 latency exceeds 500ms for 10 minutes"] +- [Any additional conditions] + +### Symptoms + +Users will experience: +- [ ] Slow response times +- [ ] Errors or failures +- [ ] Service unavailable +- [ ] [Other symptoms] + +### Probable Causes + +Common causes include: +1. **[Cause 1]**: [Description] + - Example: Database overload due to slow queries +2. **[Cause 2]**: [Description] + - Example: Memory leak causing OOM errors +3. **[Cause 3]**: [Description] + - Example: Upstream service degradation + +--- + +## Investigation Steps + +### 1. Check Service Health + +**Dashboard**: [Link to primary dashboard] + +**Key metrics to check**: +```bash +# Request rate +sum(rate(http_requests_total[5m])) + +# Error rate +sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) + +# Latency (p95, p99) +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +**What to look for**: +- [ ] Has traffic spiked recently? +- [ ] Is error rate elevated? +- [ ] Are any endpoints particularly slow? + +### 2. Check Recent Changes + +**Deployments**: +```bash +# Kubernetes +kubectl rollout history deployment/[service-name] -n [namespace] + +# Check when last deployed +kubectl get pods -n [namespace] -o wide | grep [service-name] +``` + +**What to look for**: +- [ ] Was there a recent deployment? +- [ ] Did alert start after deployment? +- [ ] Any configuration changes? + +### 3. 
Check Logs + +**Log query** (adjust for your log system): +```bash +# Kubernetes +kubectl logs deployment/[service-name] -n [namespace] --tail=100 | grep ERROR + +# Elasticsearch/Kibana +GET /logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "service": "[service-name]" } }, + { "match": { "level": "error" } }, + { "range": { "@timestamp": { "gte": "now-30m" } } } + ] + } + } +} + +# Loki/LogQL +{job="[service-name]"} |= "error" | json | level="error" +``` + +**What to look for**: +- [ ] Repeated error messages +- [ ] Stack traces +- [ ] Connection errors +- [ ] Timeout errors + +### 4. Check Dependencies + +**Database**: +```bash +# Check active connections +SELECT count(*) FROM pg_stat_activity WHERE state = 'active'; + +# Check slow queries +SELECT pid, now() - pg_stat_activity.query_start AS duration, query +FROM pg_stat_activity +WHERE state = 'active' AND now() - pg_stat_activity.query_start > interval '5 seconds'; +``` + +**External APIs**: +- [ ] Check status pages: [Link to status pages] +- [ ] Check API error rates in dashboard +- [ ] Test API endpoints manually + +**Cache** (Redis/Memcached): +```bash +# Redis info +redis-cli -h [host] INFO stats + +# Check memory usage +redis-cli -h [host] INFO memory +``` + +### 5. Check Resource Usage + +**CPU and Memory**: +```bash +# Kubernetes +kubectl top pods -n [namespace] | grep [service-name] + +# Node metrics +kubectl top nodes +``` + +**Prometheus queries**: +```promql +# CPU usage by pod +sum(rate(container_cpu_usage_seconds_total{pod=~"[service-name].*"}[5m])) by (pod) + +# Memory usage by pod +sum(container_memory_usage_bytes{pod=~"[service-name].*"}) by (pod) +``` + +**What to look for**: +- [ ] CPU throttling +- [ ] Memory approaching limits +- [ ] Disk space issues + +### 6. Check Traces (if available) + +**Trace query**: +```bash +# Jaeger +# Search for slow traces (> 1s) in last 30 minutes + +# Tempo/TraceQL +{ duration > 1s && resource.service.name = "[service-name]" } +``` + +**What to look for**: +- [ ] Which operation is slow? +- [ ] Where is time spent? (DB, external API, service logic) +- [ ] Any N+1 query patterns? 
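+
+Where a Jaeger query endpoint is available, a short script can pull the slowest recent traces and flag operations that repeat many times within a single trace (a common N+1 signature). This is a minimal sketch, not part of the standard runbook tooling: it assumes the Jaeger HTTP API is reachable (the `jaeger-query:16686` address and `[service-name]` below are placeholders) and that the `requests` library is installed.
+
+```python
+#!/usr/bin/env python3
+"""Sketch: list slow traces and flag possible N+1 span patterns via the Jaeger HTTP API."""
+import collections
+
+import requests
+
+# Placeholder endpoint and service name - adjust for your environment.
+JAEGER_URL = "http://jaeger-query:16686"
+SERVICE = "[service-name]"
+
+
+def fetch_slow_traces(min_duration="1s", lookback="30m", limit=20):
+    """Return recent traces slower than min_duration for SERVICE."""
+    resp = requests.get(
+        f"{JAEGER_URL}/api/traces",
+        params={"service": SERVICE, "minDuration": min_duration,
+                "lookback": lookback, "limit": limit},
+        timeout=10,
+    )
+    resp.raise_for_status()
+    return resp.json().get("data", [])
+
+
+def summarize(trace, repeat_threshold=10):
+    """Return total trace duration (ms) and operations repeated enough to suggest N+1."""
+    spans = trace.get("spans", [])
+    if not spans:
+        return 0.0, {}
+    start = min(s["startTime"] for s in spans)          # microseconds
+    end = max(s["startTime"] + s["duration"] for s in spans)
+    counts = collections.Counter(s["operationName"] for s in spans)
+    suspects = {op: n for op, n in counts.items() if n >= repeat_threshold}
+    return (end - start) / 1000, suspects
+
+
+if __name__ == "__main__":
+    for trace in fetch_slow_traces():
+        duration_ms, suspects = summarize(trace)
+        print(f"trace {trace['traceID']}: {duration_ms:.0f} ms")
+        for op, count in suspects.items():
+            print(f"  possible N+1: '{op}' appears {count} times")
+```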
+ +--- + +## Common Scenarios and Solutions + +### Scenario 1: Recent Deployment Caused Issue + +**Symptoms**: +- Alert started immediately after deployment +- Error logs correlate with new code + +**Solution**: +```bash +# Rollback deployment +kubectl rollout undo deployment/[service-name] -n [namespace] + +# Verify rollback succeeded +kubectl rollout status deployment/[service-name] -n [namespace] + +# Monitor for alert resolution +``` + +**Follow-up**: +- [ ] Create incident report +- [ ] Review deployment process +- [ ] Add pre-deployment checks + +### Scenario 2: Database Performance Issue + +**Symptoms**: +- Slow query logs show problematic queries +- Database CPU or connection pool exhausted + +**Solution**: +```bash +# Identify slow query +# Kill long-running query (use with caution) +SELECT pg_cancel_backend([pid]); + +# Or terminate if cancel doesn't work +SELECT pg_terminate_backend([pid]); + +# Add index if missing (in maintenance window) +CREATE INDEX CONCURRENTLY idx_name ON table_name (column_name); +``` + +**Follow-up**: +- [ ] Add query performance test +- [ ] Review and optimize query +- [ ] Consider read replicas + +### Scenario 3: Memory Leak + +**Symptoms**: +- Memory usage gradually increasing +- Eventually OOMKilled +- Restarts temporarily fix issue + +**Solution**: +```bash +# Immediate: Restart pods +kubectl rollout restart deployment/[service-name] -n [namespace] + +# Increase memory limits (temporary) +kubectl set resources deployment/[service-name] -n [namespace] \ + --limits=memory=2Gi +``` + +**Follow-up**: +- [ ] Profile application for memory leaks +- [ ] Add memory usage alerts +- [ ] Fix root cause + +### Scenario 4: Traffic Spike / DDoS + +**Symptoms**: +- Sudden traffic increase +- Traffic from unusual sources +- High CPU/memory across all instances + +**Solution**: +```bash +# Scale up immediately +kubectl scale deployment/[service-name] -n [namespace] --replicas=10 + +# Enable rate limiting at load balancer level +# (Specific steps depend on LB) + +# Block suspicious IPs if confirmed DDoS +# (Use WAF or network policies) +``` + +**Follow-up**: +- [ ] Implement rate limiting +- [ ] Add DDoS protection (CloudFlare, WAF) +- [ ] Set up auto-scaling + +### Scenario 5: Upstream Service Degradation + +**Symptoms**: +- Errors calling external API +- Timeouts to upstream service +- Upstream status page shows issues + +**Solution**: +```bash +# Enable circuit breaker (if available) +# Adjust timeout configuration +# Switch to backup service/cached data + +# Monitor external service +# Check status page: [Link] +``` + +**Follow-up**: +- [ ] Implement circuit breaker pattern +- [ ] Add fallback mechanisms +- [ ] Set up external service monitoring + +--- + +## Immediate Actions (< 5 minutes) + +These should be done first to mitigate impact: + +1. **[Action 1]**: [e.g., "Scale up service"] + ```bash + kubectl scale deployment/[service] --replicas=10 + ``` + +2. **[Action 2]**: [e.g., "Rollback deployment"] + ```bash + kubectl rollout undo deployment/[service] + ``` + +3. **[Action 3]**: [e.g., "Enable circuit breaker"] + +--- + +## Short-term Actions (< 30 minutes) + +After immediate mitigation: + +1. **[Action 1]**: [e.g., "Investigate root cause"] +2. **[Action 2]**: [e.g., "Optimize slow query"] +3. **[Action 3]**: [e.g., "Clear cache if stale"] + +--- + +## Long-term Actions (Post-Incident) + +Preventive measures: + +1. **[Action 1]**: [e.g., "Add circuit breaker"] +2. **[Action 2]**: [e.g., "Implement auto-scaling"] +3. 
---

## Long-term Actions (Post-Incident)

Preventive measures:

1. **[Action 1]**: [e.g., "Add circuit breaker"]
2. **[Action 2]**: [e.g., "Implement auto-scaling"]
3. **[Action 3]**: [e.g., "Add query performance tests"]
4. **[Action 4]**: [e.g., "Update alert thresholds"]

---

## Escalation

If issue persists after 30 minutes:

**Escalation Path**:
1. **Primary oncall**: @[username] ([slack/email])
2. **Team lead**: @[username] ([slack/email])
3. **Engineering manager**: @[username] ([slack/email])
4. **Incident commander**: @[username] ([slack/email])

**Communication**:
- **Slack channel**: #[incidents-channel]
- **Status page**: [Link]
- **Incident tracking**: [Link to incident management tool]

---

## Related Runbooks

- [Related Runbook 1]
- [Related Runbook 2]
- [Related Runbook 3]

## Related Dashboards

- [Main Service Dashboard]
- [Resource Usage Dashboard]
- [Dependency Dashboard]

## Related Documentation

- [Architecture Diagram]
- [Service Documentation]
- [API Documentation]

---

## Recent Incidents

| Date | Duration | Root Cause | Resolution | Ticket |
|------|----------|------------|------------|--------|
| 2024-10-15 | 23 min | Database pool exhausted | Increased pool size | INC-123 |
| 2024-09-30 | 45 min | Memory leak | Fixed code, restarted | INC-120 |

---

## Runbook Metadata

**Last Updated**: [Date]

**Owner**: [Team name]

**Reviewers**: [Names]

**Next Review**: [Date]

---

## Notes

- This runbook should be reviewed quarterly
- Update after each incident to capture new learnings
- Keep investigation steps concise and actionable
- Include actual commands that can be copy-pasted

diff --git a/monitoring-observability.skill b/monitoring-observability.skill
new file mode 100644
index 0000000000000000000000000000000000000000..1a0c75a165d73bf56b09d8195b2bc79804d7bc7d
GIT binary patch
literal 102073
zT&#ace&*6Qi~@%gERwnH_^``tc7W8{>52BOkTMh`v(EJ>GU4!8*+;n9v6s>f_8 zmqD3k?SZ}iI=j~eSI#ODI6TOBqi>Xui&#@yN>CaoqLz@rsKnR`ym>oh;*pMU_Ay9c z`}^zQfN;E0>P6vpWxpJ7Abjt8TzA?nKZ_=n!*U5mrqr1eIU_=fiI$P*WEc4Iv28*S zo&uKx{7fj{Zt89upiQ8V?xS^~qv+lKO!O`#3TPbK*3E%OzA1^(3N(${hsZX@@o~H1 ze%D<`aenkrnDVvlRww}Looc~1ZJBE%u%i$|3gIi}_6Qk<1&5mb1UhF z@=?)ZqbxFQ>?Io8EhS8W%pRhJQG42Gf-Edu1<%8Kti4G+KF4(0^%%S};#K&O=d=*k zNYfl+S7d%XR-3^k$h*Ql7!7aDVQ&Az87RKG?tQW!=1Ai~VJVWWfOBVVm~mOjIiF`X z+CmK@N8OBRr3vm>*A}EtGuKj7W25YLGp+N|E#*~d3r5e4{y+Z%XYpIFqMxe^}4 z)?C*6d1QpSH8|QG4OalQBF=Gep}KI9si#sq7md%zQR&0g56IiBqcHmHU{{s>h>oOO zn3sZAQB7K#8zLGfEiP3R7uc1&iQ-u+hCnN3UJM7vi>H}~pt>HUv|+%>B-nZ69p)tW zpeIv`p^Xu#UUGxz-WPqj*D-K8F?D5iua$L`npwMb9DA%ddadc@V05Xx+@#WVFapO; zla&GlHYCslHgqF*kVZ$>8ePEH1H-x87HBWr`+PtqOU0=qBFfh9u-J6#F8F(@UWHR4 zBqn%G^9t3YLQ^~^{qFW~^GvbdyaF#^@%~LsQO1GQT=$T6gMc6odWsxI;rZFtvx>M7xsPHq9(2_eeuo z^GwCY4lBE!#^5B<{unkT4jW`i;r-rm+|ZcX@u2>|0_ELATU&!ONu}?Lx%I^}6ml6O z0%e+(R6WbkP=S7T1Abjknq_?sU%kbKZ!1X_yZ%-yW8a7@;Nx;9%?e31t<1Ie9f+4j z*2tQhG}_aAi&I;<4aWFh$Yt8vf;;p8FAd9?=ll7ZKQKUz)S}&%;GF0od&%@TzVV7{ zb@gpOPFr?~)mulEc_*TgE`1}~uPg4tGRfwU9aVIOWk9po5epQ+#nu=cad2d<@o<1_ znG(lq)wK@~qYturi@tcB^Vq~?$eUG?bm%3H1h0%LVnNy3TORT4LC9c=ouT*ETOYBt zEaxtlB8srMz~-wJ?902M``#O8Ek3S6%w8Gdht<&5Ez`(3qoU5V+0{pG&EfaL8%1~z z)}gn_lXuaXfi7DxhsNp$=8KO3DC@e|XQGh*hkjGPxUo3(vNQI@mm15rw>yXEmpN2# zA#cNy;d*_tUT3G{>CO*qbs`BU1&z@LuT3w9HT*Cml*!hB=YLJatGl}BoPFHQ>OE|E zy_G&IYyM-Rkl^^+yeG&|<>Oek&0P)I&Ih?P@T(t1P=2pj=Iub)baKc|BhFqj^9*0I z5MS2X!#R3SoTQ?mo#}nah}YKA$zNk1sjcMBNu+nZJ{>q)eS$8QD)7g!f?m!&AGNXr zuk5^|d%LzYNS8c0hT$Stt!6tZZz(KOr44(_b;KE9C&@W??C}Q6hxZ2kj|XrBNPV~hTNh2&3$U`PQN~H522@F1 zW;JczR|}!W5nu)~Lbp09mTNUleiuqkkD2f7-W19}O;GuaVR2o%;C*9zOs<_rdn@JY z!P&Mff6BlV4*Q4h8h$pf+o-*?Li#nWA)S^c%?#tjh@dxuM@38~WS#Cz>MpFpLmYFr z_?>9r#Eo1$d8>~etHU?aJ3~>t3WY=&9Qx1UCx>$Dqry+#SFl0$knBfS7jkcKT36)u zt|rQAwJ_9TJhs$O{RK7`a$Za#9!VBiIO+D-R_1B!%2N;b7qVn>x3E#U$WX0F>QgKb z^S`QwFJ}6(A?2<$IDc@qx*YkLj>sxTn!z)#hx|T_xyiEv)6mjh=G*8P>wv|O)p1Tr z9B{I9kby{xgn=TtKx4LSJC^q=2SV4m{fUz1V#6~&8mT-wvlCD}2_3D6-S-jXGuu2@ z7qO&=0WZQ@{;cSQIOz+xdyQLVTFkUy@jjmd^XEhkDPOB8+d*lgM!@Lld!v%E8nsBA zrGtYE9Os029lVTD#&oOiQ7}h4>S|e1UEgDQU+=smQmpIAKp9TJ$qiCgk6^}N(-EuT z*px?6n};RrS<*r*<}5ksbnem(DwLTAYI2jknUhNSqVJp7s}ahYR_p{FzfkQyzFom3 zK}dZ?;|m*#VL-4<2BtC~E!Kn%<<{yuKc_**B+@>t57w02dj74D(tUl_Nb1)SYfIAduW!{889=>@69oK1Lby>%6AN~TOFmlj13j&n z#me(0Rg$p^8M1@|(WSi)TDrvp4|*t>4B{5ug!foDYh2&_t1R|kV9&x&Du+p@B9=QB zfQc5pRQDCd?+YG>W!8hmn;EtD=O;g;OPcBJUatj&U%tq~+cfCb;YFtruQD&MP!MUw z+KoDx45*zO<;^g1vey1YS`W@WdkA5jmD65Wd17{~s$}uI1oyqW>5zg(iE=vgc`b!q zWqi%gt2x>gv2dA_@-<6Y)5|ZVLzwQ2zf_+4ph44J@bI!4dcWnv07jmAXoE=vSYgC^WYiAWxoRHK0m*jD|#N@l?P8Lki7$1m#ui^_|txrJE4|or7#DBWO-D@ttj0ljmv>Sy?WF8?43N^1Ah* zn(8~Qmnm2dY*CR`$BUK-19Zg|)OiL5ldQ-IJJdL{v|Z4I6f6%0DO^2?L6W8bWw4u+ z1%M+`d{&Hs?(>8AnEox6F2O03IxM(xx{?|6Avv%-g88A7m=7@8w1q&j0Zt?h#~0nM zqY2JxvJ)W7VAscc;;Eg(f1SZf~m&V%7V$lN=oe$CPGxzt;EPoCL1 z`~1tXu~%ca5s61GHYrS7LyPc2Z(9x@7CSNQ>m}>kJ&!~@Isl%jI^aSQF?4U3d)fE2 zg6bW;U`ub@P2^QMyjL~}I!OGRrnW|FG-sN=h9C=EGlK~R1>&1EH0GgI|GhQ7DqcW- z0I7fx6OY%#tWUps-yB<->D!zR&khc>bfVZ8;m4yA@D4iXvf=budGng%+h)7~9o~DR zYR|TfOK`VWC6W{^ccz(*{f>9)8Sx&bnCNo*A`b`+QXDPxaDC0#q}ctuM**!-+oO5| z;Td$G9Y68+gx+ssb!5n+`9K_47o7o5xF7kCkFZ9XEe9;upWEEFi_CnL*j|rpb;I@* zU;h9Llb{18D!sq&v_F76>vu6N5_D~x#cuMKB#`v6TZT70j$SRVicUATZoX376%*n| zKcKi$M(Y6P8h4>MHv`3-jGUqY8}l-!mQS3*m|!U822PnUX4g%q;hyl1*bDxCFL{ER zk5IZI-+n!}IntF+608(EQ~Z{qI7QEC6}c?G?t+G7+bvm_zDoV+)*u>)Aq*)`?l0|9 zCdKtU4GY7@A~D&kxS}8n#0btT$n*B~O(NTIl%Qx5`a*?^0C#tq2%C^+O_m7Ji}B1> zq8Y_T&->9!hfY8X$49Ob$4Ad6Jiq8i9*WPY>MVoM_dzZQJ)o%GZ*QorND&p)O5<4~ 
zVEL;ihy-uuWxTCgJ1r?b+AMq3(!Q&N7~Q2g!c@#ws^1dh2<}-U>H$hI+i3o|eqK%} z@nX*VxBYzB(p#q-HdM&zP#Y33Tr(M-jY9#r6H25Qi;MEGA-DEN$H@J36Ejp331HH> zfQ#HwXUDWBiLx4oGxM(7kYrM-5ac}n=R}P%@NX8nwA8~A&RDRAKfg>>I$b6Y{vfwm zoKsS}eyWdJq(`F;h1Q|OwI+t~g7A(&gUuf`lX@(3f9x~KX;KoZgQSC~JNjx~hn-lT zyu?7FjK{M5S!i;ngZDEia^ey4S{3kxN*zjtKq$x*duR7r5 zS(g=d(RKSEnL>dCSXCp1CAxl*^LRKrBh;C#O?I0l#&S?wMkQ?zdF(8nMH9#$4G& z59u>4TTrYJrayI)?RlE=a>jWPO)dJd^XAp$iq#qi}e_Zo~9Bw+_b zYy-lMXu=Q-0l*ao6g?3oc=V$(o1qzq%Q4_YYYe!uyEeY_>`hGKSqmFBFv{zP;zUKV zmTeF=zW{ESJUw1qq!X7%e9UC*e@;%(?H0MiW>O$>|MpC>eY&JhJ$n7?(+B}e?*JL! z=WrvSCYT^+MzTrEWzjyhX80v&3qGp!>4QZCC9Bt5Rc+gCxqr{ZIyqGq!*Js^g|I%A z)sqrYNi4kMi*G?Ck9TN|%&{&Vg$?U_+5&G=T0m?cz6I%E!NXgf0(dnL(Hy315weR` z^@IXooM49_ll}QESsM81jU!W3H=xLK=s~* zK3x7;IMDX#SB^cEQe$yHpq}hqgvx#WbzB0LZ>kDH4ax|h@)NE2W`d6w%C#FqQ(ho0 z`Jg9MOxry#)=-;9s$Hm71wkJ`6(t{9UP)4mnjRcfv$FquR;eS{Gv1k9R9ptPM|P3; zV=nnw3T@MES?DU2@YrHv9c2VK-Z6dK{5fVO{sBj2J(fg7OVAvy?28SaCM5&9pxrxG zY#Uh?a%YwJYO+Se;g7fCh_GqNN4{%^RjZ$)f#q!~%Z7ew3V1viHb!4ss0a`gi%N_O z9IbX&T1t`!+iMS3ii-8gsB3l^y(WIVpRWYD#aF_=wcOIS71zdy9Ktcg*0Jiqb~PU@ z@&(sSA!fO-Ryf6mD+GVLm{lS^&YAA$9kYIQzFB%XC6Mbuio24je*uMdyVpAAGF)!e z{qgFJz8kVIagf5twv29=Z5B$O3GTZJy`Y)vbXguc?Ii;99PYCUsVY6X_`aO#aoZ7O zt7&Ac?D-AhT2xumEEs665bv_ZdjZNG?oOdr_h__dpZXq0<)wTBTcrsw;aBDbKe0#M z0F~}OS6a!GqT0U#9=dQ#;ptW_Vi}JpJ?KzdxQyMX(lKVHan(5HU&sXmQp2g@&Ff=k z5=fDXbrBAomr61yM+@F#C{~A#_8dhHAsWfWcN$NQzLdGf6Dw zt|2UVy-6t?Wcyj*tFyFb>+ooxe(ENgxTDMAZg!M-t1(~664N!^u96M&xP&_1)`s|$ zWm@yevdly0=~&O$04bZ2LnW{yihDLc)-hsfInRyy*gEvPFTrqt!#aWerigBjNW0KN zJU3fgV)G;aVjgn5_L7*{{o-ZzAQ!a?mU&?%SG#*^a3nKUC_b;2vd>j@yRW1nLp;f& z|67NC?c8Fo77@WTUss7R_4Qm9w4G;VqmhBWDo7W;$3^p47Ah4w`hA5Z z78s~aXW@@(Sw>n9b5JoP?%glw19l2b9HMMBWREEx$o_axJbiH3vF4Seio$aQ~x&1!U^zkfQV2ZHpW2X)WhXoh_}cOyiay zOIxtB+NzBUXgQbLg<1d^Wy$ITl) zyA()J37R3g(vw3B)g;W>Z_%lLr=Ppu{rq(x^k*!5={I?71j%+m0U$$;a!?SlPsmv8 z;w9?nw=dii6== z+e>;9VwLjlg^=(Dn{$jaU*|38v0LO!GgX0L)jcbi22G#GZ}K7*?lj}=(XtdDK1{nW z;KrBF&mDVJ8?bi0qkkR1q}jcrMj+r9DKbFzsVzW8T+rC_{O)l8WsRYbhk!9u*Rlex zP0L5hX>yaC@Q3@AStBvq*wV*jIjO0BnNA7ce7{Y%dFGDMP?$P)+%nn3s5Eup9r_Up z7fax=98Rs23yC3Roi?iy>7WrZ@Q9=7J=~VlxQ%L;hiXY&G==jO|N{LpY%bAQ^l&?Q+0uQWU*Djl!p$&Il`-{kR!S-1hlq1)6 zOr9GWt$d%t(B~UmvvE1UdmmTVb_|+n>ritfZqPf{RWTE5Tn{!dC9Q!g*KmYQbCp;y zISnJf90=E+n$sDT)=2$;*HPjzG?KJhb2UD-)d^exGTQ`on8xJ=Y$Be#xxTLwS7Y(; zWofX}(XZlrp)@4##K*_)<+@sB6iP|MQ%{}R9CeO&g3rjcle;d4BD}@-w)$g^GMfe> zDzbyCe(KZ?gICo$+Cmj_N_@w$;{#70S1NA1)kN51)H)F}G9xriWSeaJmN-XmxpdMQ zt-EyrHgc;;B@QiVXm6e~DdK7mCX#81N>$6=@>T0c*R|nD&;9#~q&$rA$n^#TaO{~r z;MvET?E2(9GtFO6%FP()7 zJk+6J|GsbLHXqleIDNXhvc^&)%#^|e9){m`2ZD#brzws+Ph~P?Meugz6vo1g?xaH_ zn!u84CvCt>aE{>9I)z4m&uSob#dj8g_mttYUMV8vZ8_Th0r!5vf1$T$e)yBaBipyw z8#e!UzW7Z#F<-;ahP8GZ6 zrm$!NxwOfZuk6CZ6XyKOB`d%TeE_OAg*ngX2_4RCYN3KiF&Z@yyQE?G`^{!cR)hrE zyA|6)i!|DiC7;pd>*Gr@<8ZTmD(}O3N>8QoEZ54zB4Wr086HxJtT5a`yNMf!^N2${ z%HGpC(-=^)Pwm(`Cva#wx|U4~BYhF;`>qqAsuH?`mkt|xCE2~6&Yw<~8X8U#igo!3 zZ*?WWwc=Cj7F)5rvD+L7+9;}6iv4DL+bbWVBsse7%ajP_Hc!%&5?9wt%Cg>xuu8>% zUgdDqvlHY_e&oVbcZx^)jk;IE6wlj(D%&r;b2s^(87s`Bitx2JH5}g+%gkNQ6_H>0 zM4wj&wX80^qxeA4J9HfD-PYpt&PJ_cyv(u3f8~;Z!5(y-|mHPr(7L2re8Lpf69TbH7-g$YJzO z1`pkW(k*KQN0TF78B<0gU=#Yh)|!}^(M;Dqt~tSv=RZA`BL?2&X|d$5R8L|&@d>4+ z1+@6Q#1ByuRU`3QxUYPb(L{$#Xm|jWd}afjj#n8B@QJt>^;p2IzL^O6RNVSL)Mej% z-3yNPszW5*!k>5e4xd5cCA*v9ONK>H*3LvAmTn1;u3{R%)JFQ8xx)G$&18kge`&k+ z7$w_-c1dLGoy-^gob1bfkD|xNPn$^(J<;`B*f`OuCKD_nI201yCi5X~!S(l?Vn zkQL``a{zz>())WW2>R`lBhbkfGDWa;vStRFxmmh;nwmk9wA?|g>I!m-ijY<1pUurc z%n`(=%634^wLsp#DFOf93W9#?`M2i&-485o_O=cVKSP`$a>Jth%fMqS0KnoU03iH3 z0dgz|`YQDwAg-2Hmadjg=8%-N|IJf_cx7y638}&B>}qQ6ZVT!Ei$hWnmOJPIjT*8l 
zpH2$^Kyd$T1wpuazs7>(#<|*>yBS+py4hMg{RN>cXrSN8GtJrpiHL~+000qC{%0!) zinabVggeB6-@LK*fGq$2g+O9spzpd07v6#lAus^|LjSb?+kmC}{u;r;#Q`!*Ku5>{ z0&{YMWd7L#|LTmijDfz_%fzy1OaS1X4*=l)6RINO*HG?2po6hF(9zk{)zv{|ADbxvQ-+B*zotz|RG$f4&YYA=ZNvKHwj6Ta&NBc2{C@(#T>S#T#?sWm-NqQA&u4CL>}l#?3kf^W z^)DPd$ky8f_rzWh#1U->2Xfy2vlRrvLqq-hyVmc+^uKo#V;2ty!5#E}aF&0?{9UE+ z-#Gn;`iUc|F0_@DhjZ0zZu7cyburpfcK$LkZ=D3@!4e; literal 0 HcmV?d00001 diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..64b1c64 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,125 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:ahmedasmar/devops-claude-skills:monitoring-observability", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "9bb89b1ce889c2df6d7c3c2eedbd6d1301297561", + "treeHash": "9fd50a78a79b6d45553e3372bc2d5142f4c48ba4a945ca724356f89f9ce08825", + "generatedAt": "2025-11-28T10:13:03.403599Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "monitoring-observability", + "description": "Monitoring and observability strategy, metrics/logs/traces systems, SLOs/error budgets, Prometheus/Grafana/Loki, OpenTelemetry, and tool comparison", + "version": null + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "b18b6358cf31ab285b751916a5b2c670b5bc2c8748ef17216f2c9106e4997f8e" + }, + { + "path": "SKILL.md", + "sha256": "c02fcac42ed2d4d6fcda67a9f835000b1a1198734e4d8d18000546dda81402e4" + }, + { + "path": "monitoring-observability.skill", + "sha256": "c2c368577bb73885c887cc824b695fb3d36f4a77e74b2e25dcd7815c331a71c1" + }, + { + "path": "references/alerting_best_practices.md", + "sha256": "99cea7a40310b77a4fdff5543a0b1ee44189497508757bee0dc9ebbe11794a53" + }, + { + "path": "references/metrics_design.md", + "sha256": "6edc73473e9d3c2ac7e46a4d97576d356d177ed701a2468c5e21d528ff9c29d7" + }, + { + "path": "references/tracing_guide.md", + "sha256": "5e419d77a31d8b3ee5c16fb57e1fc6e3e16d31efb8f4a86dd756c7327a482fa0" + }, + { + "path": "references/dql_promql_translation.md", + "sha256": "47113e77b03d9ac70fc35121efd93cf5e17e031b878d27791403493b71058c5c" + }, + { + "path": "references/tool_comparison.md", + "sha256": "fd0fc7e4fc3641ca0ddc469a14fa1373457f5a4586fe4bc7ec23afe3de9f6171" + }, + { + "path": "references/datadog_migration.md", + "sha256": "9ed5e276eb2ea67f72c91e1bb53374b293e164fa28c4c44f31ee9f8660dfaf02" + }, + { + "path": "references/logging_guide.md", + "sha256": "2c94b61d6db2c0f6b8927c8092010f3a2f1ea20d2eefd330d8073e7b4bcf4c9d" + }, + { + "path": "references/slo_sla_guide.md", + "sha256": "2a0cb69dd120897183f7bcab002a368dbe11bd5038817906da3391ca168e0052" + }, + { + "path": "scripts/log_analyzer.py", + "sha256": "c7fb7e13c2d6507c81ee9575fc8514408d36b2f2e786caeb536ba927d517046e" + }, + { + "path": "scripts/analyze_metrics.py", + "sha256": "50ad856cb043dfd70b60c6ca685b526d34b8bc5e5454dd0b530033da3da22545" + }, + { + "path": "scripts/health_check_validator.py", + "sha256": "cef8c447fabf83dfd9bd28a8d22127b87b66aafa4d151cbccd9fe1f1db0bbcf2" + }, + { + "path": "scripts/alert_quality_checker.py", + "sha256": "b561cf9c41e2de8d5f09557c018110553047d0ad54629bdc7a07a654d76263d1" + }, + { + "path": "scripts/datadog_cost_analyzer.py", + "sha256": "05a1c6c0033b04f2f5206af015907f2df4c9cf57f4c2b8f10ba2565236a5c97f" + }, 
+ { + "path": "scripts/slo_calculator.py", + "sha256": "c26ab0f0a31e5efa830a9f24938ec356bfaef927438bd47b95f4ad0015cff662" + }, + { + "path": "scripts/dashboard_generator.py", + "sha256": "6fe98a49ae431d67bc44eb631c542ba29199da72cc348e90ec99d73a05783ee5" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "7b6a16e6bce66bf87929c2f3c4ea32f4bfadd8d9606edd195f144c82ec85f151" + }, + { + "path": "assets/templates/prometheus-alerts/webapp-alerts.yml", + "sha256": "d881081e53650c335ec5cc7d5d96bade03e607e55bff3bcbafe6811377055154" + }, + { + "path": "assets/templates/prometheus-alerts/kubernetes-alerts.yml", + "sha256": "cb8c247b245ea1fb2a904f525fce8f74f9237d79eda04c2c60938135a7271415" + }, + { + "path": "assets/templates/runbooks/incident-runbook-template.md", + "sha256": "1a5ba8951cf5b1408ea2101232ffe8d88fab75ed4ae63b0c9f1902059373112d" + }, + { + "path": "assets/templates/otel-config/collector-config.yaml", + "sha256": "2696548b1c7f4034283cc2387f9730efa4811881d1c9c9219002e7affc8c29f2" + } + ], + "dirSha256": "9fd50a78a79b6d45553e3372bc2d5142f4c48ba4a945ca724356f89f9ce08825" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/alerting_best_practices.md b/references/alerting_best_practices.md new file mode 100644 index 0000000..e4ce496 --- /dev/null +++ b/references/alerting_best_practices.md @@ -0,0 +1,609 @@ +# Alerting Best Practices + +## Core Principles + +### 1. Every Alert Should Be Actionable +If you can't do something about it, don't alert on it. + +❌ Bad: `Alert: CPU > 50%` (What action should be taken?) +✅ Good: `Alert: API latency p95 > 2s for 10m` (Investigate/scale up) + +### 2. Alert on Symptoms, Not Causes +Alert on what users experience, not underlying components. + +❌ Bad: `Database connection pool 80% full` +✅ Good: `Request latency p95 > 1s` (which might be caused by DB pool) + +### 3. Alert on SLO Violations +Tie alerts to Service Level Objectives. + +✅ `Error rate exceeds 0.1% (SLO: 99.9% availability)` + +### 4. Reduce Noise +Alert fatigue is real. Only page for critical issues. + +**Alert Severity Levels**: +- **Critical**: Page on-call immediately (user-facing issue) +- **Warning**: Create ticket, review during business hours +- **Info**: Log for awareness, no action needed + +--- + +## Alert Design Patterns + +### Pattern 1: Multi-Window Multi-Burn-Rate + +Google's recommended SLO alerting approach. + +**Concept**: Alert when error budget burn rate is high enough to exhaust the budget too quickly. + +```yaml +# Fast burn (6% of budget in 1 hour) +- alert: FastBurnRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[1h])) + / + sum(rate(http_requests_total[1h])) + > (14.4 * 0.001) # 14.4x burn rate for 99.9% SLO + for: 2m + labels: + severity: critical + +# Slow burn (6% of budget in 6 hours) +- alert: SlowBurnRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[6h])) + / + sum(rate(http_requests_total[6h])) + > (6 * 0.001) # 6x burn rate for 99.9% SLO + for: 30m + labels: + severity: warning +``` + +**Burn Rate Multipliers for 99.9% SLO (0.1% error budget)**: +- 1 hour window, 2m grace: 14.4x burn rate +- 6 hour window, 30m grace: 6x burn rate +- 3 day window, 6h grace: 1x burn rate + +### Pattern 2: Rate of Change +Alert when metrics change rapidly. 
+ +```yaml +- alert: TrafficSpike + expr: | + sum(rate(http_requests_total[5m])) + > + 1.5 * sum(rate(http_requests_total[5m] offset 1h)) + for: 10m + annotations: + summary: "Traffic increased by 50% compared to 1 hour ago" +``` + +### Pattern 3: Threshold with Hysteresis +Prevent flapping with different thresholds for firing and resolving. + +```yaml +# Fire at 90%, resolve at 70% +- alert: HighCPU + expr: cpu_usage > 90 + for: 5m + +- alert: HighCPU_Resolved + expr: cpu_usage < 70 + for: 5m +``` + +### Pattern 4: Absent Metrics +Alert when expected metrics stop being reported (service down). + +```yaml +- alert: ServiceDown + expr: absent(up{job="my-service"}) + for: 5m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is not reporting metrics" +``` + +### Pattern 5: Aggregate Alerts +Alert on aggregate performance across multiple instances. + +```yaml +- alert: HighOverallErrorRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[5m])) + / + sum(rate(http_requests_total[5m])) + > 0.05 + for: 10m + annotations: + summary: "Overall error rate is {{ $value | humanizePercentage }}" +``` + +--- + +## Alert Annotation Best Practices + +### Required Fields + +**summary**: One-line description of the issue +```yaml +summary: "High error rate on {{ $labels.service }}: {{ $value | humanizePercentage }}" +``` + +**description**: Detailed explanation with context +```yaml +description: | + Error rate on {{ $labels.service }} is {{ $value | humanizePercentage }}, + which exceeds the threshold of 1% for more than 10 minutes. + + Current value: {{ $value }} + Runbook: https://runbooks.example.com/high-error-rate +``` + +**runbook_url**: Link to investigation steps +```yaml +runbook_url: "https://runbooks.example.com/alerts/{{ $labels.alertname }}" +``` + +### Optional but Recommended + +**dashboard**: Link to relevant dashboard +```yaml +dashboard: "https://grafana.example.com/d/service-dashboard?var-service={{ $labels.service }}" +``` + +**logs**: Link to logs +```yaml +logs: "https://kibana.example.com/app/discover#/?_a=(query:(query_string:(query:'service:{{ $labels.service }}')))" +``` + +--- + +## Alert Label Best Practices + +### Required Labels + +**severity**: Critical, warning, or info +```yaml +labels: + severity: critical +``` + +**team**: Who should handle this alert +```yaml +labels: + team: platform + severity: critical +``` + +**component**: What part of the system +```yaml +labels: + component: api-gateway + severity: warning +``` + +### Example Complete Alert +```yaml +- alert: HighLatency + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service) + ) > 1 + for: 10m + labels: + severity: warning + team: backend + component: api + environment: "{{ $labels.environment }}" + annotations: + summary: "High latency on {{ $labels.service }}" + description: | + P95 latency on {{ $labels.service }} is {{ $value }}s, exceeding 1s threshold. + + This may impact user experience. Check recent deployments and database performance. 
+ + Current p95: {{ $value }}s + Threshold: 1s + Duration: 10m+ + runbook_url: "https://runbooks.example.com/high-latency" + dashboard: "https://grafana.example.com/d/api-dashboard" + logs: "https://kibana.example.com/app/discover#/?_a=(query:(query_string:(query:'service:{{ $labels.service }} AND level:error')))" +``` + +--- + +## Alert Thresholds + +### General Guidelines + +**Response Time / Latency**: +- Warning: p95 > 500ms or p99 > 1s +- Critical: p95 > 2s or p99 > 5s + +**Error Rate**: +- Warning: > 1% +- Critical: > 5% + +**Availability**: +- Warning: < 99.9% +- Critical: < 99.5% + +**CPU Utilization**: +- Warning: > 70% for 15m +- Critical: > 90% for 5m + +**Memory Utilization**: +- Warning: > 80% for 15m +- Critical: > 95% for 5m + +**Disk Space**: +- Warning: > 80% full +- Critical: > 90% full + +**Queue Depth**: +- Warning: > 70% of max capacity +- Critical: > 90% of max capacity + +### Application-Specific Thresholds + +Set thresholds based on: +1. **Historical performance**: Use p95 of last 30 days + 20% +2. **SLO requirements**: If SLO is 99.9%, alert at 99.5% +3. **Business impact**: What error rate causes user complaints? + +--- + +## The "for" Clause + +Prevent alert flapping by requiring the condition to be true for a duration. + +### Guidelines + +**Critical alerts**: Short duration (2-5m) +```yaml +- alert: ServiceDown + expr: up == 0 + for: 2m # Quick detection for critical issues +``` + +**Warning alerts**: Longer duration (10-30m) +```yaml +- alert: HighMemoryUsage + expr: memory_usage > 80 + for: 15m # Avoid noise from temporary spikes +``` + +**Resource saturation**: Medium duration (5-10m) +```yaml +- alert: HighCPU + expr: cpu_usage > 90 + for: 5m +``` + +--- + +## Alert Routing + +### Severity-Based Routing + +```yaml +# alertmanager.yml +route: + group_by: ['alertname', 'cluster'] + group_wait: 10s + group_interval: 5m + repeat_interval: 4h + receiver: 'default' + + routes: + # Critical alerts → PagerDuty + - match: + severity: critical + receiver: pagerduty + group_wait: 10s + repeat_interval: 5m + + # Warning alerts → Slack + - match: + severity: warning + receiver: slack + group_wait: 30s + repeat_interval: 12h + + # Info alerts → Email + - match: + severity: info + receiver: email + repeat_interval: 24h +``` + +### Team-Based Routing + +```yaml +routes: + # Platform team + - match: + team: platform + receiver: platform-pagerduty + + # Backend team + - match: + team: backend + receiver: backend-slack + + # Database team + - match: + component: database + receiver: dba-pagerduty +``` + +### Time-Based Routing + +```yaml +# Only page during business hours for non-critical +routes: + - match: + severity: warning + receiver: slack + active_time_intervals: + - business_hours + +time_intervals: + - name: business_hours + time_intervals: + - weekdays: ['monday:friday'] + times: + - start_time: '09:00' + end_time: '17:00' + location: 'America/New_York' +``` + +--- + +## Alert Grouping + +### Intelligent Grouping + +**Group by service and environment**: +```yaml +route: + group_by: ['alertname', 'service', 'environment'] + group_wait: 30s + group_interval: 5m +``` + +This prevents: +- 50 alerts for "HighCPU" on different pods → 1 grouped alert +- Mixing production and staging alerts + +### Inhibition Rules + +Suppress related alerts when a parent alert fires. 
+ +```yaml +inhibit_rules: + # If service is down, suppress latency alerts + - source_match: + alertname: ServiceDown + target_match: + alertname: HighLatency + equal: ['service'] + + # If node is down, suppress all pod alerts on that node + - source_match: + alertname: NodeDown + target_match_re: + alertname: '(PodCrashLoop|HighCPU|HighMemory)' + equal: ['node'] +``` + +--- + +## Runbook Structure + +Every alert should link to a runbook with: + +### 1. Context +- What does this alert mean? +- What is the user impact? +- What is the urgency? + +### 2. Investigation Steps +```markdown +## Investigation + +1. Check service health dashboard + https://grafana.example.com/d/service-dashboard + +2. Check recent deployments + kubectl rollout history deployment/myapp -n production + +3. Check error logs + kubectl logs deployment/myapp -n production --tail=100 | grep ERROR + +4. Check dependencies + - Database: Check slow query log + - Redis: Check memory usage + - External APIs: Check status pages +``` + +### 3. Common Causes +```markdown +## Common Causes + +- **Recent deployment**: Check if alert started after deployment +- **Traffic spike**: Check request rate, might need to scale +- **Database issues**: Check query performance and connection pool +- **External API degradation**: Check third-party status pages +``` + +### 4. Resolution Steps +```markdown +## Resolution + +### Immediate Actions (< 5 minutes) +1. Scale up if traffic spike: `kubectl scale deployment myapp --replicas=10` +2. Rollback if recent deployment: `kubectl rollout undo deployment/myapp` + +### Short-term Actions (< 30 minutes) +1. Restart pods if memory leak: `kubectl rollout restart deployment/myapp` +2. Clear cache if stale data: `redis-cli -h cache.example.com FLUSHDB` + +### Long-term Actions (post-incident) +1. Review and optimize slow queries +2. Implement circuit breakers +3. Add more capacity +4. Update alert thresholds if false positive +``` + +### 5. Escalation +```markdown +## Escalation + +If issue persists after 30 minutes: +- Slack: #backend-oncall +- PagerDuty: Escalate to senior engineer +- Incident Commander: Jane Doe (jane@example.com) +``` + +--- + +## Anti-Patterns to Avoid + +### 1. Alert on Everything +❌ Don't: Alert on every warning log +✅ Do: Alert on error rate exceeding threshold + +### 2. Alert Without Context +❌ Don't: "Error rate high" +✅ Do: "Error rate 5.2% exceeds 1% threshold for 10m, impacting checkout flow" + +### 3. Static Thresholds for Dynamic Systems +❌ Don't: `cpu_usage > 70` (fails during scale-up) +✅ Do: Alert on SLO violations or rate of change + +### 4. No "for" Clause +❌ Don't: Alert immediately on threshold breach +✅ Do: Use `for: 5m` to avoid flapping + +### 5. Too Many Recipients +❌ Don't: Page 10 people for every alert +✅ Do: Route to specific on-call rotation + +### 6. Duplicate Alerts +❌ Don't: Alert on both cause and symptom +✅ Do: Alert on symptom, use inhibition for causes + +### 7. 
No Runbook +❌ Don't: Alert without guidance +✅ Do: Include runbook_url in every alert + +--- + +## Alert Testing + +### Test Alert Firing +```bash +# Trigger test alert in Prometheus +amtool alert add alertname="TestAlert" \ + severity="warning" \ + summary="Test alert" + +# Or use Alertmanager API +curl -X POST http://alertmanager:9093/api/v1/alerts \ + -d '[{ + "labels": {"alertname": "TestAlert", "severity": "critical"}, + "annotations": {"summary": "Test critical alert"} + }]' +``` + +### Verify Alert Rules +```bash +# Check syntax +promtool check rules alerts.yml + +# Test expression +promtool query instant http://prometheus:9090 \ + 'sum(rate(http_requests_total{status=~"5.."}[5m]))' + +# Unit test alerts +promtool test rules test.yml +``` + +### Test Alertmanager Routing +```bash +# Test which receiver an alert would go to +amtool config routes test \ + --config.file=alertmanager.yml \ + alertname="HighLatency" \ + severity="critical" \ + team="backend" +``` + +--- + +## On-Call Best Practices + +### Rotation Schedule +- **Primary on-call**: First responder +- **Secondary on-call**: Escalation backup +- **Rotation length**: 1 week (balance load vs context) +- **Handoff**: Monday morning (not Friday evening) + +### On-Call Checklist +```markdown +## Pre-shift +- [ ] Test pager/phone +- [ ] Review recent incidents +- [ ] Check upcoming deployments +- [ ] Update contact info + +## During shift +- [ ] Respond to pages within 5 minutes +- [ ] Document all incidents +- [ ] Update runbooks if gaps found +- [ ] Communicate in #incidents channel + +## Post-shift +- [ ] Hand off open incidents +- [ ] Complete incident reports +- [ ] Suggest improvements +- [ ] Update team documentation +``` + +### Escalation Policy +1. **Primary**: Responds within 5 minutes +2. **Secondary**: Auto-escalate after 15 minutes +3. **Manager**: Auto-escalate after 30 minutes +4. **Incident Commander**: Critical incidents only + +--- + +## Metrics About Alerts + +Monitor your monitoring system! 
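One simple way to do this is to pull the live alert state from Alertmanager's v2 API and record it in your own tooling. A rough sketch, assuming the standard `/api/v2/alerts` endpoint; the Alertmanager URL is a placeholder, and the `severity` label must match what your alert rules actually set:

```python
from collections import Counter

import requests

ALERTMANAGER_URL = "http://alertmanager:9093"  # assumption: adjust for your setup

def firing_alerts_by_severity():
    """Count currently firing (active, unsilenced, uninhibited) alerts per severity label."""
    resp = requests.get(
        f"{ALERTMANAGER_URL}/api/v2/alerts",
        params={"active": "true", "silenced": "false", "inhibited": "false"},
        timeout=10,
    )
    resp.raise_for_status()
    counts = Counter()
    for alert in resp.json():
        severity = alert.get("labels", {}).get("severity", "none")
        counts[severity] += 1
    return counts

print(firing_alerts_by_severity())  # e.g. Counter({'warning': 3, 'critical': 1})
```

Running this on a schedule (or exporting the counts as metrics) gives a simple view of alert volume trends alongside the queries below.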
+ +### Key Metrics +```promql +# Alert firing frequency +sum(ALERTS{alertstate="firing"}) by (alertname) + +# Alert duration +ALERTS_FOR_STATE{alertstate="firing"} + +# Alerts per severity +sum(ALERTS{alertstate="firing"}) by (severity) + +# Time to acknowledge (from PagerDuty/etc) +pagerduty_incident_ack_duration_seconds +``` + +### Alert Quality Metrics +- **Mean Time to Acknowledge (MTTA)**: < 5 minutes +- **Mean Time to Resolve (MTTR)**: < 30 minutes +- **False Positive Rate**: < 10% +- **Alert Coverage**: % of incidents with preceding alert > 80% diff --git a/references/datadog_migration.md b/references/datadog_migration.md new file mode 100644 index 0000000..27fa5b5 --- /dev/null +++ b/references/datadog_migration.md @@ -0,0 +1,649 @@ +# Migrating from Datadog to Open-Source Stack + +## Overview + +This guide helps you migrate from Datadog to a cost-effective open-source observability stack: +- **Metrics**: Datadog → Prometheus + Grafana +- **Logs**: Datadog → Loki + Grafana +- **Traces**: Datadog APM → Tempo/Jaeger + Grafana +- **Dashboards**: Datadog → Grafana +- **Alerts**: Datadog Monitors → Prometheus Alertmanager + +**Estimated Cost Savings**: 60-80% for similar functionality + +--- + +## Cost Comparison + +### Example: 100-host infrastructure + +**Datadog**: +- Infrastructure Pro: $1,500/month (100 hosts × $15) +- Custom Metrics: $50/month (5,000 extra metrics beyond included 10,000) +- Logs: $2,000/month (20GB/day × $0.10/GB × 30 days) +- APM: $3,100/month (100 hosts × $31) +- **Total**: ~$6,650/month ($79,800/year) + +**Open-Source Stack** (self-hosted): +- Infrastructure: $1,200/month (EC2/GKE for Prometheus, Grafana, Loki, Tempo) +- Storage: $300/month (S3/GCS for long-term metrics and traces) +- Operations time: Variable +- **Total**: ~$1,500-2,500/month ($18,000-30,000/year) + +**Savings**: $49,800-61,800/year + +--- + +## Migration Strategy + +### Phase 1: Run Parallel (Month 1-2) +- Deploy open-source stack alongside Datadog +- Migrate metrics first (lowest risk) +- Validate data accuracy +- Build confidence + +### Phase 2: Migrate Dashboards & Alerts (Month 2-3) +- Convert Datadog dashboards to Grafana +- Translate alert rules +- Train team on new tools + +### Phase 3: Migrate Logs & Traces (Month 3-4) +- Set up Loki for log aggregation +- Deploy Tempo/Jaeger for tracing +- Update application instrumentation + +### Phase 4: Decommission Datadog (Month 4-5) +- Confirm all functionality migrated +- Cancel Datadog subscription +- Archive Datadog dashboards/alerts for reference + +--- + +## 1. 
Metrics Migration (Datadog → Prometheus) + +### Step 1: Deploy Prometheus + +**Kubernetes** (recommended): +```yaml +# prometheus-values.yaml +prometheus: + prometheusSpec: + retention: 30d + storageSpec: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 100Gi + + # Scrape configs + additionalScrapeConfigs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod +``` + +**Install**: +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install prometheus prometheus-community/kube-prometheus-stack -f prometheus-values.yaml +``` + +**Docker Compose**: +```yaml +version: '3' +services: + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=30d' + +volumes: + prometheus-data: +``` + +### Step 2: Replace DogStatsD with Prometheus Exporters + +**Before (DogStatsD)**: +```python +from datadog import statsd + +statsd.increment('page.views') +statsd.histogram('request.duration', 0.5) +statsd.gauge('active_users', 100) +``` + +**After (Prometheus Python client)**: +```python +from prometheus_client import Counter, Histogram, Gauge + +page_views = Counter('page_views_total', 'Page views') +request_duration = Histogram('request_duration_seconds', 'Request duration') +active_users = Gauge('active_users', 'Active users') + +# Usage +page_views.inc() +request_duration.observe(0.5) +active_users.set(100) +``` + +### Step 3: Metric Name Translation + +| Datadog Metric | Prometheus Equivalent | +|----------------|----------------------| +| `system.cpu.idle` | `node_cpu_seconds_total{mode="idle"}` | +| `system.mem.free` | `node_memory_MemFree_bytes` | +| `system.disk.used` | `node_filesystem_size_bytes - node_filesystem_free_bytes` | +| `docker.cpu.usage` | `container_cpu_usage_seconds_total` | +| `kubernetes.pods.running` | `kube_pod_status_phase{phase="Running"}` | + +### Step 4: Export Existing Datadog Metrics (Optional) + +Use Datadog API to export historical data: + +```python +from datadog import api, initialize + +options = { + 'api_key': 'YOUR_API_KEY', + 'app_key': 'YOUR_APP_KEY' +} +initialize(**options) + +# Query metric +result = api.Metric.query( + start=int(time.time() - 86400), # Last 24h + end=int(time.time()), + query='avg:system.cpu.user{*}' +) + +# Convert to Prometheus format and import +``` + +--- + +## 2. 
Dashboard Migration (Datadog → Grafana) + +### Step 1: Export Datadog Dashboards + +```python +import requests +import json + +api_key = "YOUR_API_KEY" +app_key = "YOUR_APP_KEY" + +headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key +} + +# Get all dashboards +response = requests.get( + 'https://api.datadoghq.com/api/v1/dashboard', + headers=headers +) + +dashboards = response.json() + +# Export each dashboard +for dashboard in dashboards['dashboards']: + dash_id = dashboard['id'] + detail = requests.get( + f'https://api.datadoghq.com/api/v1/dashboard/{dash_id}', + headers=headers + ).json() + + with open(f'datadog_{dash_id}.json', 'w') as f: + json.dump(detail, f, indent=2) +``` + +### Step 2: Convert to Grafana Format + +**Manual Conversion Template**: + +| Datadog Widget | Grafana Panel Type | +|----------------|-------------------| +| Timeseries | Graph / Time series | +| Query Value | Stat | +| Toplist | Table / Bar gauge | +| Heatmap | Heatmap | +| Distribution | Histogram | + +**Automated Conversion** (basic example): +```python +def convert_datadog_to_grafana(datadog_dashboard): + grafana_dashboard = { + "title": datadog_dashboard['title'], + "panels": [] + } + + for widget in datadog_dashboard['widgets']: + panel = { + "title": widget['definition'].get('title', ''), + "type": map_widget_type(widget['definition']['type']), + "targets": convert_queries(widget['definition']['requests']) + } + grafana_dashboard['panels'].append(panel) + + return grafana_dashboard +``` + +### Step 3: Common Query Translations + +See `dql_promql_translation.md` for comprehensive query mappings. + +**Example conversions**: + +``` +Datadog: avg:system.cpu.user{*} +Prometheus: avg(rate(node_cpu_seconds_total{mode="user"}[5m])) * 100 + +Datadog: sum:requests.count{status:200}.as_rate() +Prometheus: sum(rate(http_requests_total{status="200"}[5m])) + +Datadog: p95:request.duration{*} +Prometheus: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +## 3. 
Alert Migration (Datadog Monitors → Prometheus Alerts) + +### Step 1: Export Datadog Monitors + +```python +import requests + +api_key = "YOUR_API_KEY" +app_key = "YOUR_APP_KEY" + +headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key +} + +response = requests.get( + 'https://api.datadoghq.com/api/v1/monitor', + headers=headers +) + +monitors = response.json() + +# Save each monitor +for monitor in monitors: + with open(f'monitor_{monitor["id"]}.json', 'w') as f: + json.dump(monitor, f, indent=2) +``` + +### Step 2: Convert to Prometheus Alert Rules + +**Datadog Monitor**: +```json +{ + "name": "High CPU Usage", + "type": "metric alert", + "query": "avg(last_5m):avg:system.cpu.user{*} > 80", + "message": "CPU usage is high on {{host.name}}" +} +``` + +**Prometheus Alert**: +```yaml +groups: + - name: infrastructure + rules: + - alert: HighCPUUsage + expr: | + 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is {{ $value }}%" +``` + +### Step 3: Alert Routing (Datadog → Alertmanager) + +**Datadog notification channels** → **Alertmanager receivers** + +```yaml +# alertmanager.yml +route: + group_by: ['alertname', 'severity'] + receiver: 'slack-notifications' + +receivers: + - name: 'slack-notifications' + slack_configs: + - api_url: 'YOUR_SLACK_WEBHOOK' + channel: '#alerts' + + - name: 'pagerduty-critical' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_KEY' +``` + +--- + +## 4. Log Migration (Datadog → Loki) + +### Step 1: Deploy Loki + +**Kubernetes**: +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm install loki grafana/loki-stack \ + --set loki.persistence.enabled=true \ + --set loki.persistence.size=100Gi \ + --set promtail.enabled=true +``` + +**Docker Compose**: +```yaml +version: '3' +services: + loki: + image: grafana/loki:latest + ports: + - "3100:3100" + volumes: + - ./loki-config.yaml:/etc/loki/local-config.yaml + - loki-data:/loki + + promtail: + image: grafana/promtail:latest + volumes: + - /var/log:/var/log + - ./promtail-config.yaml:/etc/promtail/config.yml + +volumes: + loki-data: +``` + +### Step 2: Replace Datadog Log Forwarder + +**Before (Datadog Agent)**: +```yaml +# datadog.yaml +logs_enabled: true + +logs_config: + container_collect_all: true +``` + +**After (Promtail)**: +```yaml +# promtail-config.yaml +server: + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: system + static_configs: + - targets: + - localhost + labels: + job: varlogs + __path__: /var/log/*.log +``` + +### Step 3: Query Translation + +**Datadog Logs Query**: +``` +service:my-app status:error +``` + +**Loki LogQL**: +```logql +{job="my-app", level="error"} +``` + +**More examples**: +``` +Datadog: service:api-gateway status:error @http.status_code:>=500 +Loki: {service="api-gateway", level="error"} | json | http_status_code >= 500 + +Datadog: source:nginx "404" +Loki: {source="nginx"} |= "404" +``` + +--- + +## 5. 
APM Migration (Datadog APM → Tempo/Jaeger) + +### Step 1: Choose Tracing Backend + +- **Tempo**: Better for high volume, cheaper storage (object storage) +- **Jaeger**: More mature, better UI, requires separate storage + +### Step 2: Replace Datadog Tracer with OpenTelemetry + +**Before (Datadog Python)**: +```python +from ddtrace import tracer + +@tracer.wrap() +def my_function(): + pass +``` + +**After (OpenTelemetry)**: +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +# Setup +trace.set_tracer_provider(TracerProvider()) +tracer = trace.get_tracer(__name__) +exporter = OTLPSpanExporter(endpoint="tempo:4317") + +@tracer.start_as_current_span("my_function") +def my_function(): + pass +``` + +### Step 3: Deploy Tempo + +```yaml +# tempo.yaml +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +storage: + trace: + backend: s3 + s3: + bucket: tempo-traces + endpoint: s3.amazonaws.com +``` + +--- + +## 6. Infrastructure Migration + +### Recommended Architecture + +``` +┌─────────────────────────────────────────┐ +│ Grafana (Visualization) │ +│ - Dashboards │ +│ - Unified view │ +└─────────────────────────────────────────┘ + ↓ ↓ ↓ +┌──────────────┐ ┌──────────┐ ┌──────────┐ +│ Prometheus │ │ Loki │ │ Tempo │ +│ (Metrics) │ │ (Logs) │ │ (Traces) │ +└──────────────┘ └──────────┘ └──────────┘ + ↓ ↓ ↓ +┌─────────────────────────────────────────┐ +│ Applications (OpenTelemetry) │ +└─────────────────────────────────────────┘ +``` + +### Sizing Recommendations + +**100-host environment**: + +- **Prometheus**: 2-4 CPU, 8-16GB RAM, 100GB SSD +- **Grafana**: 1 CPU, 2GB RAM +- **Loki**: 2-4 CPU, 8GB RAM, 100GB SSD +- **Tempo**: 2-4 CPU, 8GB RAM, S3 for storage +- **Alertmanager**: 1 CPU, 1GB RAM + +**Total**: ~8-16 CPU, 32-64GB RAM, 200GB SSD + object storage + +--- + +## 7. Migration Checklist + +### Pre-Migration +- [ ] Calculate current Datadog costs +- [ ] Identify all Datadog integrations +- [ ] Export all dashboards +- [ ] Export all monitors +- [ ] Document custom metrics +- [ ] Get stakeholder approval + +### During Migration +- [ ] Deploy Prometheus + Grafana +- [ ] Deploy Loki + Promtail +- [ ] Deploy Tempo/Jaeger (if using APM) +- [ ] Migrate metrics instrumentation +- [ ] Convert dashboards (top 10 critical first) +- [ ] Convert alerts (critical alerts first) +- [ ] Update application logging +- [ ] Replace APM instrumentation +- [ ] Run parallel for 2-4 weeks +- [ ] Validate data accuracy +- [ ] Train team on new tools + +### Post-Migration +- [ ] Decommission Datadog agent from all hosts +- [ ] Cancel Datadog subscription +- [ ] Archive Datadog configs +- [ ] Document new workflows +- [ ] Create runbooks for common tasks + +--- + +## 8. 
Common Challenges & Solutions + +### Challenge: Missing Datadog Features + +**Datadog Synthetic Monitoring**: +- Solution: Use **Blackbox Exporter** (Prometheus) or **Grafana Synthetic Monitoring** + +**Datadog Network Performance Monitoring**: +- Solution: Use **Cilium Hubble** (Kubernetes) or **eBPF-based tools** + +**Datadog RUM (Real User Monitoring)**: +- Solution: Use **Grafana Faro** or **OpenTelemetry Browser SDK** + +### Challenge: Team Learning Curve + +**Solution**: +- Provide training sessions (2-3 hours per tool) +- Create internal documentation with examples +- Set up sandbox environment for practice +- Assign champions for each tool + +### Challenge: Query Performance + +**Prometheus too slow**: +- Use **Thanos** or **Cortex** for scaling +- Implement recording rules for expensive queries +- Increase retention only where needed + +**Loki too slow**: +- Add more labels for better filtering +- Use chunk caching +- Consider **parallel query execution** + +--- + +## 9. Maintenance Comparison + +### Datadog (Managed) +- **Ops burden**: Low (fully managed) +- **Upgrades**: Automatic +- **Scaling**: Automatic +- **Cost**: High ($6k-10k+/month) + +### Open-Source Stack (Self-hosted) +- **Ops burden**: Medium (requires ops team) +- **Upgrades**: Manual (quarterly) +- **Scaling**: Manual planning required +- **Cost**: Low ($1.5k-3k/month infrastructure) + +**Hybrid Option**: Use **Grafana Cloud** (managed Prometheus/Loki/Tempo) +- Cost: ~$3k/month for 100 hosts +- Ops burden: Low +- Savings: ~50% vs Datadog + +--- + +## 10. ROI Calculation + +### Example Scenario + +**Before (Datadog)**: +- Monthly cost: $7,000 +- Annual cost: $84,000 + +**After (Self-hosted OSS)**: +- Infrastructure: $1,800/month +- Operations (0.5 FTE): $4,000/month +- Annual cost: $69,600 + +**Savings**: $14,400/year + +**After (Grafana Cloud)**: +- Monthly cost: $3,500 +- Annual cost: $42,000 + +**Savings**: $42,000/year (50%) + +**Break-even**: Immediate (no migration costs beyond engineering time) + +--- + +## Resources + +- **Prometheus**: https://prometheus.io/docs/ +- **Grafana**: https://grafana.com/docs/ +- **Loki**: https://grafana.com/docs/loki/ +- **Tempo**: https://grafana.com/docs/tempo/ +- **OpenTelemetry**: https://opentelemetry.io/ +- **Migration Tools**: https://github.com/grafana/dashboard-linter + +--- + +## Support + +If you need help with migration: +- Grafana Labs offers migration consulting +- Many SRE consulting firms specialize in this +- Community support via Slack/Discord channels diff --git a/references/dql_promql_translation.md b/references/dql_promql_translation.md new file mode 100644 index 0000000..1dd3d12 --- /dev/null +++ b/references/dql_promql_translation.md @@ -0,0 +1,756 @@ +# DQL (Datadog Query Language) ↔ PromQL Translation Guide + +## Quick Reference + +| Concept | Datadog (DQL) | Prometheus (PromQL) | +|---------|---------------|---------------------| +| Aggregation | `avg:`, `sum:`, `min:`, `max:` | `avg()`, `sum()`, `min()`, `max()` | +| Rate | `.as_rate()`, `.as_count()` | `rate()`, `increase()` | +| Percentile | `p50:`, `p95:`, `p99:` | `histogram_quantile()` | +| Filtering | `{tag:value}` | `{label="value"}` | +| Time window | `last_5m`, `last_1h` | `[5m]`, `[1h]` | + +--- + +## Basic Queries + +### Simple Metric Query + +**Datadog**: +``` +system.cpu.user +``` + +**Prometheus**: +```promql +node_cpu_seconds_total{mode="user"} +``` + +--- + +### Metric with Filter + +**Datadog**: +``` +system.cpu.user{host:web-01} +``` + +**Prometheus**: +```promql 
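# the Datadog tag host:web-01 becomes an exact-match filter on the instance label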
+node_cpu_seconds_total{mode="user", instance="web-01"} +``` + +--- + +### Multiple Filters (AND) + +**Datadog**: +``` +system.cpu.user{host:web-01,env:production} +``` + +**Prometheus**: +```promql +node_cpu_seconds_total{mode="user", instance="web-01", env="production"} +``` + +--- + +### Wildcard Filters + +**Datadog**: +``` +system.cpu.user{host:web-*} +``` + +**Prometheus**: +```promql +node_cpu_seconds_total{mode="user", instance=~"web-.*"} +``` + +--- + +### OR Filters + +**Datadog**: +``` +system.cpu.user{host:web-01 OR host:web-02} +``` + +**Prometheus**: +```promql +node_cpu_seconds_total{mode="user", instance=~"web-01|web-02"} +``` + +--- + +## Aggregations + +### Average + +**Datadog**: +``` +avg:system.cpu.user{*} +``` + +**Prometheus**: +```promql +avg(node_cpu_seconds_total{mode="user"}) +``` + +--- + +### Sum + +**Datadog**: +``` +sum:requests.count{*} +``` + +**Prometheus**: +```promql +sum(http_requests_total) +``` + +--- + +### Min/Max + +**Datadog**: +``` +min:system.mem.free{*} +max:system.mem.free{*} +``` + +**Prometheus**: +```promql +min(node_memory_MemFree_bytes) +max(node_memory_MemFree_bytes) +``` + +--- + +### Aggregation by Tag/Label + +**Datadog**: +``` +avg:system.cpu.user{*} by {host} +``` + +**Prometheus**: +```promql +avg by (instance) (node_cpu_seconds_total{mode="user"}) +``` + +--- + +## Rates and Counts + +### Rate (per second) + +**Datadog**: +``` +sum:requests.count{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(http_requests_total[5m])) +``` + +Note: Prometheus requires explicit time window `[5m]` + +--- + +### Count (total over time) + +**Datadog**: +``` +sum:requests.count{*}.as_count() +``` + +**Prometheus**: +```promql +sum(increase(http_requests_total[1h])) +``` + +--- + +### Derivative (change over time) + +**Datadog**: +``` +derivative(avg:system.disk.used{*}) +``` + +**Prometheus**: +```promql +deriv(node_filesystem_size_bytes[5m]) +``` + +--- + +## Percentiles + +### P50 (Median) + +**Datadog**: +``` +p50:request.duration{*} +``` + +**Prometheus** (requires histogram): +```promql +histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +### P95 + +**Datadog**: +``` +p95:request.duration{*} +``` + +**Prometheus**: +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +### P99 + +**Datadog**: +``` +p99:request.duration{*} +``` + +**Prometheus**: +```promql +histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +## Time Windows + +### Last 5 minutes + +**Datadog**: +``` +avg(last_5m):system.cpu.user{*} +``` + +**Prometheus**: +```promql +avg(node_cpu_seconds_total{mode="user"}[5m]) +``` + +--- + +### Last 1 hour + +**Datadog**: +``` +avg(last_1h):system.cpu.user{*} +``` + +**Prometheus**: +```promql +avg_over_time(node_cpu_seconds_total{mode="user"}[1h]) +``` + +--- + +## Math Operations + +### Division + +**Datadog**: +``` +avg:system.mem.used{*} / avg:system.mem.total{*} +``` + +**Prometheus**: +```promql +node_memory_MemUsed_bytes / node_memory_MemTotal_bytes +``` + +--- + +### Multiplication + +**Datadog**: +``` +avg:system.cpu.user{*} * 100 +``` + +**Prometheus**: +```promql +avg(node_cpu_seconds_total{mode="user"}) * 100 +``` + +--- + +### Percentage Calculation + +**Datadog**: +``` +(sum:requests.errors{*} / sum:requests.count{*}) * 100 +``` + +**Prometheus**: +```promql +(sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 +``` + +--- + 
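When translating queries in bulk, it helps to sanity-check each PromQL candidate against a live Prometheus before wiring it into a dashboard. A minimal sketch using Prometheus's standard `/api/v1/query` HTTP endpoint; the server URL is a placeholder:

```python
import requests

PROMETHEUS_URL = "http://prometheus:9090"  # assumption: point at your own server

def instant_query(promql: str):
    """Run an instant query and return the raw result vector."""
    resp = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query",
        params={"query": promql},
        timeout=10,
    )
    resp.raise_for_status()
    body = resp.json()
    if body["status"] != "success":
        raise RuntimeError(f"query failed: {body}")
    return body["data"]["result"]

# Example: confirm the translated error-rate expression returns data
series = instant_query(
    '(sum(rate(http_requests_total{status=~"5.."}[5m])) '
    '/ sum(rate(http_requests_total[5m]))) * 100'
)
print(f"{len(series)} series returned")
```

---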
+## Common Use Cases + +### CPU Usage Percentage + +**Datadog**: +``` +100 - avg:system.cpu.idle{*} +``` + +**Prometheus**: +```promql +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +--- + +### Memory Usage Percentage + +**Datadog**: +``` +(avg:system.mem.used{*} / avg:system.mem.total{*}) * 100 +``` + +**Prometheus**: +```promql +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 +``` + +--- + +### Disk Usage Percentage + +**Datadog**: +``` +(avg:system.disk.used{*} / avg:system.disk.total{*}) * 100 +``` + +**Prometheus**: +```promql +(node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 +``` + +--- + +### Request Rate (requests/sec) + +**Datadog**: +``` +sum:requests.count{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(http_requests_total[5m])) +``` + +--- + +### Error Rate Percentage + +**Datadog**: +``` +(sum:requests.errors{*}.as_rate() / sum:requests.count{*}.as_rate()) * 100 +``` + +**Prometheus**: +```promql +(sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 +``` + +--- + +### Request Latency (P95) + +**Datadog**: +``` +p95:request.duration{*} +``` + +**Prometheus**: +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +### Top 5 Hosts by CPU + +**Datadog**: +``` +top(avg:system.cpu.user{*} by {host}, 5, 'mean', 'desc') +``` + +**Prometheus**: +```promql +topk(5, avg by (instance) (rate(node_cpu_seconds_total{mode="user"}[5m]))) +``` + +--- + +## Functions + +### Absolute Value + +**Datadog**: +``` +abs(diff(avg:system.cpu.user{*})) +``` + +**Prometheus**: +```promql +abs(delta(node_cpu_seconds_total{mode="user"}[5m])) +``` + +--- + +### Ceiling/Floor + +**Datadog**: +``` +ceil(avg:system.cpu.user{*}) +floor(avg:system.cpu.user{*}) +``` + +**Prometheus**: +```promql +ceil(avg(node_cpu_seconds_total{mode="user"})) +floor(avg(node_cpu_seconds_total{mode="user"})) +``` + +--- + +### Clamp (Limit Range) + +**Datadog**: +``` +clamp_min(avg:system.cpu.user{*}, 0) +clamp_max(avg:system.cpu.user{*}, 100) +``` + +**Prometheus**: +```promql +clamp_min(avg(node_cpu_seconds_total{mode="user"}), 0) +clamp_max(avg(node_cpu_seconds_total{mode="user"}), 100) +``` + +--- + +### Moving Average + +**Datadog**: +``` +moving_rollup(avg:system.cpu.user{*}, 60, 'avg') +``` + +**Prometheus**: +```promql +avg_over_time(node_cpu_seconds_total{mode="user"}[1h]) +``` + +--- + +## Advanced Patterns + +### Compare to Previous Period + +**Datadog**: +``` +sum:requests.count{*}.as_rate() / timeshift(sum:requests.count{*}.as_rate(), 3600) +``` + +**Prometheus**: +```promql +sum(rate(http_requests_total[5m])) / sum(rate(http_requests_total[5m] offset 1h)) +``` + +--- + +### Forecast + +**Datadog**: +``` +forecast(avg:system.disk.used{*}, 'linear', 1) +``` + +**Prometheus**: +```promql +predict_linear(node_filesystem_size_bytes[1h], 3600) +``` + +Note: Predicts value 1 hour in future based on last 1 hour trend + +--- + +### Anomaly Detection + +**Datadog**: +``` +anomalies(avg:system.cpu.user{*}, 'basic', 2) +``` + +**Prometheus**: No built-in function +- Use recording rules with stddev +- External tools like **Robust Perception's anomaly detector** +- Or use **Grafana ML** plugin + +--- + +### Outlier Detection + +**Datadog**: +``` +outliers(avg:system.cpu.user{*} by {host}, 'mad') +``` + +**Prometheus**: No built-in function +- Calculate manually with stddev: +```promql +abs(metric - avg(metric)) > 2 * 
stddev(metric) +``` + +--- + +## Container & Kubernetes + +### Container CPU Usage + +**Datadog**: +``` +avg:docker.cpu.usage{*} by {container_name} +``` + +**Prometheus**: +```promql +avg by (container) (rate(container_cpu_usage_seconds_total[5m])) +``` + +--- + +### Container Memory Usage + +**Datadog**: +``` +avg:docker.mem.rss{*} by {container_name} +``` + +**Prometheus**: +```promql +avg by (container) (container_memory_rss) +``` + +--- + +### Pod Count by Status + +**Datadog**: +``` +sum:kubernetes.pods.running{*} by {kube_namespace} +``` + +**Prometheus**: +```promql +sum by (namespace) (kube_pod_status_phase{phase="Running"}) +``` + +--- + +## Database Queries + +### MySQL Queries Per Second + +**Datadog**: +``` +sum:mysql.performance.queries{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(mysql_global_status_queries[5m])) +``` + +--- + +### PostgreSQL Active Connections + +**Datadog**: +``` +avg:postgresql.connections{*} +``` + +**Prometheus**: +```promql +avg(pg_stat_database_numbackends) +``` + +--- + +### Redis Memory Usage + +**Datadog**: +``` +avg:redis.mem.used{*} +``` + +**Prometheus**: +```promql +avg(redis_memory_used_bytes) +``` + +--- + +## Network Metrics + +### Network Bytes Sent + +**Datadog**: +``` +sum:system.net.bytes_sent{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(node_network_transmit_bytes_total[5m])) +``` + +--- + +### Network Bytes Received + +**Datadog**: +``` +sum:system.net.bytes_rcvd{*}.as_rate() +``` + +**Prometheus**: +```promql +sum(rate(node_network_receive_bytes_total[5m])) +``` + +--- + +## Key Differences + +### 1. Time Windows +- **Datadog**: Optional, defaults to query time range +- **Prometheus**: Always required for rate/increase functions + +### 2. Histograms +- **Datadog**: Percentiles available directly +- **Prometheus**: Requires histogram buckets + `histogram_quantile()` + +### 3. Default Aggregation +- **Datadog**: No default, must specify +- **Prometheus**: Returns all time series unless aggregated + +### 4. Metric Types +- **Datadog**: All metrics treated similarly +- **Prometheus**: Explicit types (counter, gauge, histogram, summary) + +### 5. Tag vs Label +- **Datadog**: Uses "tags" (key:value) +- **Prometheus**: Uses "labels" (key="value") + +--- + +## Migration Tips + +1. **Start with dashboards**: Convert most-used dashboards first +2. **Use recording rules**: Pre-calculate expensive PromQL queries +3. **Test in parallel**: Run both systems during migration +4. **Document mappings**: Create team-specific translation guide +5. 
**Train team**: PromQL has learning curve, invest in training + +--- + +## Tools + +- **Datadog Dashboard Exporter**: Export JSON dashboards +- **Grafana Dashboard Linter**: Validate converted dashboards +- **PromQL Learning Resources**: https://prometheus.io/docs/prometheus/latest/querying/basics/ + +--- + +## Common Gotchas + +### Rate without Time Window + +❌ **Wrong**: +```promql +rate(http_requests_total) +``` + +✅ **Correct**: +```promql +rate(http_requests_total[5m]) +``` + +--- + +### Aggregating Before Rate + +❌ **Wrong**: +```promql +rate(sum(http_requests_total)[5m]) +``` + +✅ **Correct**: +```promql +sum(rate(http_requests_total[5m])) +``` + +--- + +### Histogram Quantile Without by (le) + +❌ **Wrong**: +```promql +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +``` + +✅ **Correct**: +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +--- + +## Quick Conversion Checklist + +When converting a Datadog query to PromQL: + +- [ ] Replace metric name (e.g., `system.cpu.user` → `node_cpu_seconds_total`) +- [ ] Convert tags to labels (`{tag:value}` → `{label="value"}`) +- [ ] Add time window for rate/increase (`[5m]`) +- [ ] Change aggregation syntax (`avg:` → `avg()`) +- [ ] Convert percentiles to histogram_quantile if needed +- [ ] Test query in Prometheus before adding to dashboard +- [ ] Add `by (label)` for grouped aggregations + +--- + +## Need More Help? + +- See `datadog_migration.md` for full migration guide +- PromQL documentation: https://prometheus.io/docs/prometheus/latest/querying/ +- Practice at: https://demo.promlens.com/ diff --git a/references/logging_guide.md b/references/logging_guide.md new file mode 100644 index 0000000..8b11d44 --- /dev/null +++ b/references/logging_guide.md @@ -0,0 +1,775 @@ +# Logging Guide + +## Structured Logging + +### Why Structured Logs? + +**Unstructured** (text): +``` +2024-10-28 14:32:15 User john@example.com logged in from 192.168.1.1 +``` + +**Structured** (JSON): +```json +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "info", + "message": "User logged in", + "user": "john@example.com", + "ip": "192.168.1.1", + "event_type": "user_login" +} +``` + +**Benefits**: +- Easy to parse and query +- Consistent format +- Machine-readable +- Efficient storage and indexing + +--- + +## Log Levels + +Use appropriate log levels for better filtering and alerting. 
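Levels are only useful if the threshold is set per environment; a minimal Python sketch using the standard `logging` module (the `LOG_LEVEL` environment variable name is an assumption, not a convention from this guide) of keeping DEBUG output out of production while leaving it available locally:

```python
import logging
import os

# Read the desired threshold from the environment (assumed variable name),
# defaulting to INFO so DEBUG-level noise is filtered unless explicitly enabled.
level_name = os.getenv("LOG_LEVEL", "INFO").upper()

logging.basicConfig(
    level=getattr(logging, level_name, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)

logger = logging.getLogger("orders")
logger.debug("Cart contents recalculated")  # emitted only when LOG_LEVEL=DEBUG
logger.info("Order placed")                 # emitted at INFO and above
```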
+ +### DEBUG +**When**: Development, troubleshooting +**Examples**: +- Function entry/exit +- Variable values +- Internal state changes + +```python +logger.debug("Processing request", extra={ + "request_id": req_id, + "params": params +}) +``` + +### INFO +**When**: Important business events +**Examples**: +- User actions (login, purchase) +- System state changes (started, stopped) +- Significant milestones + +```python +logger.info("Order placed", extra={ + "order_id": "12345", + "user_id": "user123", + "amount": 99.99 +}) +``` + +### WARN +**When**: Potentially problematic situations +**Examples**: +- Deprecated API usage +- Slow operations (but not failing) +- Retry attempts +- Resource usage approaching limits + +```python +logger.warning("API response slow", extra={ + "endpoint": "/api/users", + "duration_ms": 2500, + "threshold_ms": 1000 +}) +``` + +### ERROR +**When**: Error conditions that need attention +**Examples**: +- Failed requests +- Exceptions caught and handled +- Integration failures +- Data validation errors + +```python +logger.error("Payment processing failed", extra={ + "order_id": "12345", + "error": str(e), + "payment_gateway": "stripe" +}, exc_info=True) +``` + +### FATAL/CRITICAL +**When**: Severe errors causing shutdown +**Examples**: +- Database connection lost +- Out of memory +- Configuration errors preventing startup + +```python +logger.critical("Database connection lost", extra={ + "database": "postgres", + "host": "db.example.com", + "attempt": 3 +}) +``` + +--- + +## Required Fields + +Every log entry should include: + +### 1. Timestamp +ISO 8601 format with timezone: +```json +{ + "timestamp": "2024-10-28T14:32:15.123Z" +} +``` + +### 2. Level +Standard levels: debug, info, warn, error, critical +```json +{ + "level": "error" +} +``` + +### 3. Message +Human-readable description: +```json +{ + "message": "User authentication failed" +} +``` + +### 4. Service/Application +What component logged this: +```json +{ + "service": "api-gateway", + "version": "1.2.3" +} +``` + +### 5. Environment +```json +{ + "environment": "production" +} +``` + +--- + +## Recommended Fields + +### Request Context +```json +{ + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "user_id": "user123", + "session_id": "sess_abc", + "ip_address": "192.168.1.1", + "user_agent": "Mozilla/5.0..." 
+} +``` + +### Performance Metrics +```json +{ + "duration_ms": 245, + "response_size_bytes": 1024 +} +``` + +### Error Details +```json +{ + "error_type": "ValidationError", + "error_message": "Invalid email format", + "stack_trace": "...", + "error_code": "VAL_001" +} +``` + +### Business Context +```json +{ + "order_id": "ORD-12345", + "customer_id": "CUST-789", + "transaction_amount": 99.99, + "payment_method": "credit_card" +} +``` + +--- + +## Implementation Examples + +### Python (using structlog) +```python +import structlog + +logger = structlog.get_logger() + +# Configure structured logging +structlog.configure( + processors=[ + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.add_log_level, + structlog.processors.JSONRenderer() + ] +) + +# Usage +logger.info( + "user_logged_in", + user_id="user123", + ip_address="192.168.1.1", + login_method="oauth" +) +``` + +### Node.js (using Winston) +```javascript +const winston = require('winston'); + +const logger = winston.createLogger({ + format: winston.format.json(), + defaultMeta: { service: 'api-gateway' }, + transports: [ + new winston.transports.Console() + ] +}); + +logger.info('User logged in', { + userId: 'user123', + ipAddress: '192.168.1.1', + loginMethod: 'oauth' +}); +``` + +### Go (using zap) +```go +import "go.uber.org/zap" + +logger, _ := zap.NewProduction() +defer logger.Sync() + +logger.Info("User logged in", + zap.String("userId", "user123"), + zap.String("ipAddress", "192.168.1.1"), + zap.String("loginMethod", "oauth"), +) +``` + +### Java (using Logback with JSON) +```java +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import net.logstash.logback.argument.StructuredArguments; + +Logger logger = LoggerFactory.getLogger(MyClass.class); + +logger.info("User logged in", + StructuredArguments.kv("userId", "user123"), + StructuredArguments.kv("ipAddress", "192.168.1.1"), + StructuredArguments.kv("loginMethod", "oauth") +); +``` + +--- + +## Log Aggregation Patterns + +### Pattern 1: ELK Stack (Elasticsearch, Logstash, Kibana) + +**Architecture**: +``` +Application → Filebeat → Logstash → Elasticsearch → Kibana +``` + +**filebeat.yml**: +```yaml +filebeat.inputs: + - type: log + enabled: true + paths: + - /var/log/app/*.log + json.keys_under_root: true + json.add_error_key: true + +output.logstash: + hosts: ["logstash:5044"] +``` + +**logstash.conf**: +``` +input { + beats { + port => 5044 + } +} + +filter { + json { + source => "message" + } + + date { + match => ["timestamp", "ISO8601"] + } + + grok { + match => { "message" => "%{COMBINEDAPACHELOG}" } + } +} + +output { + elasticsearch { + hosts => ["elasticsearch:9200"] + index => "app-logs-%{+YYYY.MM.dd}" + } +} +``` + +### Pattern 2: Loki (Grafana Loki) + +**Architecture**: +``` +Application → Promtail → Loki → Grafana +``` + +**promtail-config.yml**: +```yaml +server: + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: app + static_configs: + - targets: + - localhost + labels: + job: app + __path__: /var/log/app/*.log + pipeline_stages: + - json: + expressions: + level: level + timestamp: timestamp + - labels: + level: + service: + - timestamp: + source: timestamp + format: RFC3339 +``` + +**Query in Grafana**: +```logql +{job="app"} |= "error" | json | level="error" +``` + +### Pattern 3: CloudWatch Logs + +**Install CloudWatch agent**: +```json +{ + "logs": { + "logs_collected": { + "files": { + "collect_list": [ + { + 
"file_path": "/var/log/app/*.log", + "log_group_name": "/aws/app/production", + "log_stream_name": "{instance_id}", + "timezone": "UTC" + } + ] + } + } + } +} +``` + +**Query with CloudWatch Insights**: +``` +fields @timestamp, level, message, user_id +| filter level = "error" +| sort @timestamp desc +| limit 100 +``` + +### Pattern 4: Fluentd/Fluent Bit + +**fluent-bit.conf**: +``` +[INPUT] + Name tail + Path /var/log/app/*.log + Parser json + Tag app.* + +[FILTER] + Name record_modifier + Match * + Record hostname ${HOSTNAME} + Record cluster production + +[OUTPUT] + Name es + Match * + Host elasticsearch + Port 9200 + Index app-logs + Type _doc +``` + +--- + +## Query Patterns + +### Find Errors in Time Range +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "level": "error" } }, + { "range": { "@timestamp": { + "gte": "now-1h", + "lte": "now" + }}} + ] + } + } +} +``` + +**Loki (LogQL)**: +```logql +{job="app", level="error"} |= "error" +``` + +**CloudWatch Insights**: +``` +fields @timestamp, @message +| filter level = "error" +| filter @timestamp > ago(1h) +``` + +### Count Errors by Type +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "size": 0, + "query": { "match": { "level": "error" } }, + "aggs": { + "error_types": { + "terms": { "field": "error_type.keyword" } + } + } +} +``` + +**Loki**: +```logql +sum by (error_type) (count_over_time({job="app", level="error"}[1h])) +``` + +### Find Slow Requests +**Elasticsearch**: +```json +GET /app-logs-*/_search +{ + "query": { + "range": { "duration_ms": { "gte": 1000 } } + }, + "sort": [ { "duration_ms": "desc" } ] +} +``` + +### Trace Request Through Services +**Elasticsearch** (using request_id): +```json +GET /_search +{ + "query": { + "match": { "request_id": "550e8400-e29b-41d4-a716-446655440000" } + }, + "sort": [ { "@timestamp": "asc" } ] +} +``` + +--- + +## Sampling and Rate Limiting + +### When to Sample +- **High volume services**: > 10,000 logs/second +- **Debug logs in production**: Sample 1-10% +- **Cost optimization**: Reduce storage costs + +### Sampling Strategies + +**1. Random Sampling**: +```python +import random + +if random.random() < 0.1: # Sample 10% + logger.debug("Debug message", ...) +``` + +**2. Rate Limiting**: +```python +from rate_limiter import RateLimiter + +limiter = RateLimiter(max_per_second=100) + +if limiter.allow(): + logger.info("Rate limited log", ...) +``` + +**3. Error-Biased Sampling**: +```python +# Always log errors, sample successful requests +if level == "error" or random.random() < 0.01: + logger.log(level, message, ...) +``` + +**4. 
Head-Based Sampling** (trace-aware): +```python +# If trace is sampled, log all related logs +if trace_context.is_sampled(): + logger.info("Traced log", trace_id=trace_context.trace_id) +``` + +--- + +## Log Retention + +### Retention Strategy + +**Hot tier** (fast SSD): 7-30 days +- Recent logs +- Full query performance +- High cost + +**Warm tier** (regular disk): 30-90 days +- Older logs +- Slower queries acceptable +- Medium cost + +**Cold tier** (object storage): 90+ days +- Archive logs +- Query via restore +- Low cost + +### Example: Elasticsearch ILM Policy +```json +{ + "policy": { + "phases": { + "hot": { + "actions": { + "rollover": { + "max_size": "50GB", + "max_age": "1d" + } + } + }, + "warm": { + "min_age": "7d", + "actions": { + "allocate": { "number_of_replicas": 1 }, + "shrink": { "number_of_shards": 1 } + } + }, + "cold": { + "min_age": "30d", + "actions": { + "allocate": { "require": { "box_type": "cold" } } + } + }, + "delete": { + "min_age": "90d", + "actions": { + "delete": {} + } + } + } + } +} +``` + +--- + +## Security and Compliance + +### PII Redaction + +**Before logging**: +```python +import re + +def redact_pii(data): + # Redact email + data = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', + '[EMAIL]', data) + # Redact credit card + data = re.sub(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', + '[CARD]', data) + # Redact SSN + data = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', data) + return data + +logger.info("User data", user_input=redact_pii(user_input)) +``` + +**In Logstash**: +``` +filter { + mutate { + gsub => [ + "message", "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "[EMAIL]", + "message", "\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b", "[CARD]" + ] + } +} +``` + +### Access Control + +**Elasticsearch** (with Security): +```yaml +# Role for developers +dev_logs: + indices: + - names: ['app-logs-*'] + privileges: ['read'] + query: '{"match": {"environment": "development"}}' +``` + +**CloudWatch** (IAM Policy): +```json +{ + "Effect": "Allow", + "Action": [ + "logs:DescribeLogGroups", + "logs:GetLogEvents", + "logs:FilterLogEvents" + ], + "Resource": "arn:aws:logs:*:*:log-group:/aws/app/production:*" +} +``` + +--- + +## Common Pitfalls + +### 1. Logging Sensitive Data +❌ `logger.info("Login", password=password)` +✅ `logger.info("Login", user_id=user_id)` + +### 2. Excessive Logging +❌ Logging every iteration of a loop +✅ Log aggregate results or sample + +### 3. Not Including Context +❌ `logger.error("Failed")` +✅ `logger.error("Payment failed", order_id=order_id, error=str(e))` + +### 4. Inconsistent Formats +❌ Mix of JSON and plain text +✅ Pick one format and stick to it + +### 5. No Request IDs +❌ Can't trace request across services +✅ Generate and propagate request_id + +### 6. Logging to Multiple Places +❌ Log to file AND stdout AND syslog +✅ Log to stdout, let agent handle routing + +### 7. Blocking on Log Writes +❌ Synchronous writes to remote systems +✅ Asynchronous buffered writes + +--- + +## Performance Optimization + +### 1. Async Logging +```python +import logging +from logging.handlers import QueueHandler, QueueListener +import queue + +# Create queue +log_queue = queue.Queue() + +# Configure async handler +queue_handler = QueueHandler(log_queue) +logger.addHandler(queue_handler) + +# Process logs in background thread +listener = QueueListener(log_queue, *handlers) +listener.start() +``` + +### 2. 
Conditional Logging +```python +# Avoid expensive operations if not logging +if logger.isEnabledFor(logging.DEBUG): + logger.debug("Details", data=expensive_serialization(obj)) +``` + +### 3. Batching +```python +# Batch logs before sending +batch = [] +for log in logs: + batch.append(log) + if len(batch) >= 100: + send_to_aggregator(batch) + batch = [] +``` + +### 4. Compression +```yaml +# Filebeat with compression +output.logstash: + hosts: ["logstash:5044"] + compression_level: 3 +``` + +--- + +## Monitoring Log Pipeline + +Track pipeline health with metrics: + +```promql +# Log ingestion rate +rate(logs_ingested_total[5m]) + +# Pipeline lag +log_processing_lag_seconds + +# Dropped logs +rate(logs_dropped_total[5m]) + +# Error parsing rate +rate(logs_parse_errors_total[5m]) +``` + +Alert on: +- Sudden drop in log volume (service down?) +- High parse error rate (format changed?) +- Pipeline lag > 1 minute (capacity issue?) diff --git a/references/metrics_design.md b/references/metrics_design.md new file mode 100644 index 0000000..d915742 --- /dev/null +++ b/references/metrics_design.md @@ -0,0 +1,406 @@ +# Metrics Design Guide + +## The Four Golden Signals + +The Four Golden Signals from Google's SRE book provide a comprehensive view of system health: + +### 1. Latency +**What**: Time to service a request + +**Why Monitor**: Directly impacts user experience + +**Key Metrics**: +- Request duration (p50, p95, p99, p99.9) +- Time to first byte (TTFB) +- Backend processing time +- Database query latency + +**PromQL Examples**: +```promql +# P95 latency +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Average latency by endpoint +avg(rate(http_request_duration_seconds_sum[5m])) by (endpoint) + / +avg(rate(http_request_duration_seconds_count[5m])) by (endpoint) +``` + +**Alert Thresholds**: +- Warning: p95 > 500ms +- Critical: p99 > 2s + +### 2. Traffic +**What**: Demand on your system + +**Why Monitor**: Understand load patterns, capacity planning + +**Key Metrics**: +- Requests per second (RPS) +- Transactions per second (TPS) +- Concurrent connections +- Network throughput + +**PromQL Examples**: +```promql +# Requests per second +sum(rate(http_requests_total[5m])) + +# Requests per second by status code +sum(rate(http_requests_total[5m])) by (status) + +# Traffic growth rate (week over week) +sum(rate(http_requests_total[5m])) + / +sum(rate(http_requests_total[5m] offset 7d)) +``` + +**Alert Thresholds**: +- Warning: RPS > 80% of capacity +- Critical: RPS > 95% of capacity + +### 3. Errors +**What**: Rate of requests that fail + +**Why Monitor**: Direct indicator of user-facing problems + +**Key Metrics**: +- Error rate (%) +- 5xx response codes +- Failed transactions +- Exception counts + +**PromQL Examples**: +```promql +# Error rate percentage +sum(rate(http_requests_total{status=~"5.."}[5m])) + / +sum(rate(http_requests_total[5m])) * 100 + +# Error count by type +sum(rate(http_requests_total{status=~"5.."}[5m])) by (status) + +# Application errors +rate(application_errors_total[5m]) +``` + +**Alert Thresholds**: +- Warning: Error rate > 1% +- Critical: Error rate > 5% + +### 4. 
Saturation +**What**: How "full" your service is + +**Why Monitor**: Predict capacity issues before they impact users + +**Key Metrics**: +- CPU utilization +- Memory utilization +- Disk I/O +- Network bandwidth +- Queue depth +- Thread pool usage + +**PromQL Examples**: +```promql +# CPU saturation +100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Memory saturation +(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 + +# Disk saturation +rate(node_disk_io_time_seconds_total[5m]) * 100 + +# Queue depth +queue_depth_current / queue_depth_max * 100 +``` + +**Alert Thresholds**: +- Warning: > 70% utilization +- Critical: > 90% utilization + +--- + +## RED Method (for Services) + +**R**ate, **E**rrors, **D**uration - a simplified approach for request-driven services + +### Rate +Number of requests per second: +```promql +sum(rate(http_requests_total[5m])) +``` + +### Errors +Number of failed requests per second: +```promql +sum(rate(http_requests_total{status=~"5.."}[5m])) +``` + +### Duration +Time taken to process requests: +```promql +histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` + +**When to Use**: Microservices, APIs, web applications + +--- + +## USE Method (for Resources) + +**U**tilization, **S**aturation, **E**rrors - for infrastructure resources + +### Utilization +Percentage of time resource is busy: +```promql +# CPU utilization +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Disk utilization +(node_filesystem_size_bytes - node_filesystem_avail_bytes) + / node_filesystem_size_bytes * 100 +``` + +### Saturation +Amount of work the resource cannot service (queued): +```promql +# Load average (saturation indicator) +node_load15 + +# Disk I/O wait time +rate(node_disk_io_time_weighted_seconds_total[5m]) +``` + +### Errors +Count of error events: +```promql +# Network errors +rate(node_network_receive_errs_total[5m]) +rate(node_network_transmit_errs_total[5m]) + +# Disk errors +rate(node_disk_io_errors_total[5m]) +``` + +**When to Use**: Servers, databases, network devices + +--- + +## Metric Types + +### Counter +Monotonically increasing value (never decreases) + +**Examples**: Request count, error count, bytes sent + +**Usage**: +```promql +# Always use rate() or increase() with counters +rate(http_requests_total[5m]) # Requests per second +increase(http_requests_total[1h]) # Total requests in 1 hour +``` + +### Gauge +Value that can go up and down + +**Examples**: Memory usage, queue depth, concurrent connections + +**Usage**: +```promql +# Use directly or with aggregations +avg(memory_usage_bytes) +max(queue_depth) +``` + +### Histogram +Samples observations and counts them in configurable buckets + +**Examples**: Request duration, response size + +**Usage**: +```promql +# Calculate percentiles +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +# Average from histogram +rate(http_request_duration_seconds_sum[5m]) + / +rate(http_request_duration_seconds_count[5m]) +``` + +### Summary +Similar to histogram but calculates quantiles on client side + +**Usage**: Less flexible than histograms, avoid for new metrics + +--- + +## Cardinality Best Practices + +**Cardinality**: Number of unique time series + +### High Cardinality Labels (AVOID) +❌ User ID +❌ Email address +❌ IP address +❌ Timestamp +❌ Random IDs + +### Low Cardinality Labels (GOOD) +✅ Environment (prod, staging) +✅ Region (us-east-1, eu-west-1) +✅ Service name +✅ 
HTTP status code category (2xx, 4xx, 5xx) +✅ Endpoint/route + +### Calculating Cardinality Impact +``` +Time series = unique combinations of labels + +Example: +service (5) × environment (3) × region (4) × status (5) = 300 time series ✅ + +service (5) × environment (3) × region (4) × user_id (1M) = 60M time series ❌ +``` + +--- + +## Naming Conventions + +### Prometheus Naming +``` +___total + +Examples: +http_requests_total +http_request_duration_seconds +process_cpu_seconds_total +node_memory_MemAvailable_bytes +``` + +**Rules**: +- Use snake_case +- Include unit in name (seconds, bytes, ratio) +- Use `_total` suffix for counters +- Namespace by application/component + +### CloudWatch Naming +``` +/ + +Examples: +AWS/EC2/CPUUtilization +MyApp/RequestCount +``` + +**Rules**: +- Use PascalCase +- Group by namespace +- No unit in name (specified separately) + +--- + +## Dashboard Design + +### Key Principles + +1. **Top-Down Layout**: Most important metrics first +2. **Color Coding**: Red (critical), yellow (warning), green (healthy) +3. **Consistent Time Windows**: All panels use same time range +4. **Limit Panels**: 8-12 panels per dashboard maximum +5. **Include Context**: Show related metrics together + +### Dashboard Structure + +``` +┌─────────────────────────────────────────────┐ +│ Overall Health (Single Stats) │ +│ [Requests/s] [Error%] [P95 Latency] │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Request Rate & Errors (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Latency Distribution (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Resource Usage (Graphs) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Dependencies (Graphs) │ +└─────────────────────────────────────────────┘ +``` + +### Template Variables +Use variables for filtering: +- Environment: `$environment` +- Service: `$service` +- Region: `$region` +- Pod: `$pod` + +--- + +## Common Pitfalls + +### 1. Monitoring What You Build, Not What Users Experience +❌ `backend_processing_complete` +✅ `user_request_completed` + +### 2. Too Many Metrics +- Start with Four Golden Signals +- Add metrics only when needed for specific issues +- Remove unused metrics + +### 3. Incorrect Aggregations +❌ `avg(rate(...))` - averages rates incorrectly +✅ `sum(rate(...)) / count(...)` - correct average + +### 4. Wrong Time Windows +- Too short (< 1m): Noisy data +- Too long (> 15m): Miss short-lived issues +- Sweet spot: 5m for most alerts + +### 5. 
Missing Labels +❌ `http_requests_total` +✅ `http_requests_total{method="GET", status="200", endpoint="/api/users"}` + +--- + +## Metric Collection Best Practices + +### Application Instrumentation +```python +from prometheus_client import Counter, Histogram, Gauge + +# Counter for requests +requests_total = Counter('http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status']) + +# Histogram for latency +request_duration = Histogram('http_request_duration_seconds', + 'HTTP request duration', + ['method', 'endpoint']) + +# Gauge for in-progress requests +requests_in_progress = Gauge('http_requests_in_progress', + 'HTTP requests currently being processed') +``` + +### Collection Intervals +- Application metrics: 15-30s +- Infrastructure metrics: 30-60s +- Billing/cost metrics: 5-15m +- External API checks: 1-5m + +### Retention +- Raw metrics: 15-30 days +- 5m aggregates: 90 days +- 1h aggregates: 1 year +- Daily aggregates: 2+ years diff --git a/references/slo_sla_guide.md b/references/slo_sla_guide.md new file mode 100644 index 0000000..3704fd9 --- /dev/null +++ b/references/slo_sla_guide.md @@ -0,0 +1,652 @@ +# SLI, SLO, and SLA Guide + +## Definitions + +### SLI (Service Level Indicator) +**What**: A quantitative measure of service quality + +**Examples**: +- Request latency (ms) +- Error rate (%) +- Availability (%) +- Throughput (requests/sec) + +### SLO (Service Level Objective) +**What**: Target value or range for an SLI + +**Examples**: +- "99.9% of requests return in < 500ms" +- "99.95% availability" +- "Error rate < 0.1%" + +### SLA (Service Level Agreement) +**What**: Business contract with consequences for SLO violations + +**Examples**: +- "99.9% uptime or 10% monthly credit" +- "p95 latency < 1s or refund" + +### Relationship +``` +SLI = Measurement +SLO = Target (internal goal) +SLA = Promise (customer contract with penalties) + +Example: +SLI: Actual availability this month = 99.92% +SLO: Target availability = 99.9% +SLA: Guaranteed availability = 99.5% (with penalties) +``` + +--- + +## Choosing SLIs + +### The Four Golden Signals as SLIs + +1. **Latency SLIs** + - Request duration (p50, p95, p99) + - Time to first byte + - Page load time + +2. **Availability/Success SLIs** + - % of successful requests + - % uptime + - % of requests completing + +3. **Throughput SLIs** (less common) + - Requests per second + - Transactions per second + +4. 
**Saturation SLIs** (internal only) + - Resource utilization + - Queue depth + +### SLI Selection Criteria + +✅ **Good SLIs**: +- Measured from user perspective +- Directly impact user experience +- Aggregatable across instances +- Proportional to user happiness + +❌ **Bad SLIs**: +- Internal metrics only +- Not user-facing +- Hard to measure consistently + +### Examples by Service Type + +**Web Application**: +``` +SLI 1: Request Success Rate + = successful_requests / total_requests + +SLI 2: Request Latency (p95) + = 95th percentile of response times + +SLI 3: Availability + = time_service_responding / total_time +``` + +**API Service**: +``` +SLI 1: Error Rate + = (4xx_errors + 5xx_errors) / total_requests + +SLI 2: Response Time (p99) + = 99th percentile latency + +SLI 3: Throughput + = requests_per_second +``` + +**Batch Processing**: +``` +SLI 1: Job Success Rate + = successful_jobs / total_jobs + +SLI 2: Processing Latency + = time_from_submission_to_completion + +SLI 3: Freshness + = age_of_oldest_unprocessed_item +``` + +**Storage Service**: +``` +SLI 1: Durability + = data_not_lost / total_data + +SLI 2: Read Latency (p99) + = 99th percentile read time + +SLI 3: Write Success Rate + = successful_writes / total_writes +``` + +--- + +## Setting SLO Targets + +### Start with Current Performance + +1. **Measure baseline**: Collect 30 days of data +2. **Analyze distribution**: Look at p50, p95, p99, p99.9 +3. **Set initial SLO**: Slightly better than worst performer +4. **Iterate**: Tighten or loosen based on feasibility + +### Example Process + +**Current Performance** (30 days): +``` +p50 latency: 120ms +p95 latency: 450ms +p99 latency: 1200ms +p99.9 latency: 3500ms + +Error rate: 0.05% +Availability: 99.95% +``` + +**Initial SLOs**: +``` +Latency: p95 < 500ms (slightly worse than current p95) +Error rate: < 0.1% (double current rate) +Availability: 99.9% (slightly worse than current) +``` + +**Rationale**: Start loose, prevent false alarms, tighten over time + +### Common SLO Targets + +**Availability**: +- **99%** (3.65 days downtime/year): Internal tools +- **99.5%** (1.83 days/year): Non-critical services +- **99.9%** (8.76 hours/year): Standard production +- **99.95%** (4.38 hours/year): Critical services +- **99.99%** (52 minutes/year): High availability +- **99.999%** (5 minutes/year): Mission critical + +**Latency**: +- **p50 < 100ms**: Excellent responsiveness +- **p95 < 500ms**: Standard web applications +- **p99 < 1s**: Acceptable for most users +- **p99.9 < 5s**: Acceptable for rare edge cases + +**Error Rate**: +- **< 0.01%** (99.99% success): Critical operations +- **< 0.1%** (99.9% success): Standard production +- **< 1%** (99% success): Non-critical services + +--- + +## Error Budgets + +### Concept + +Error budget = (100% - SLO target) + +If SLO is 99.9%, error budget is 0.1% + +**Purpose**: Balance reliability with feature velocity + +### Calculation + +**For availability**: +``` +Monthly error budget = (1 - SLO) × time_period + +Example (99.9% SLO, 30 days): +Error budget = 0.001 × 30 days = 0.03 days = 43.2 minutes +``` + +**For request-based SLIs**: +``` +Error budget = (1 - SLO) × total_requests + +Example (99.9% SLO, 10M requests/month): +Error budget = 0.001 × 10,000,000 = 10,000 failed requests +``` + +### Error Budget Consumption + +**Formula**: +``` +Budget consumed = actual_errors / allowed_errors × 100% + +Example: +SLO: 99.9% (0.1% error budget) +Total requests: 1,000,000 +Failed requests: 500 +Allowed failures: 1,000 + +Budget consumed = 500 / 1,000 × 
100% = 50% +Budget remaining = 50% +``` + +### Error Budget Policy + +**Example policy**: + +```markdown +## Error Budget Policy + +### If error budget > 50% +- Deploy frequently (multiple times per day) +- Take calculated risks +- Experiment with new features +- Acceptable to have some incidents + +### If error budget 20-50% +- Deploy normally (once per day) +- Increase testing +- Review recent changes +- Monitor closely + +### If error budget < 20% +- Freeze non-critical deploys +- Focus on reliability improvements +- Postmortem all incidents +- Reduce change velocity + +### If error budget exhausted (< 0%) +- Complete deploy freeze except rollbacks +- All hands on reliability +- Mandatory postmortems +- Executive escalation +``` + +--- + +## Error Budget Burn Rate + +### Concept + +Burn rate = rate of error budget consumption + +**Example**: +- Monthly budget: 43.2 minutes (99.9% SLO) +- If consuming at 2x rate: Budget exhausted in 15 days +- If consuming at 10x rate: Budget exhausted in 3 days + +### Burn Rate Calculation + +``` +Burn rate = (actual_error_rate / allowed_error_rate) + +Example: +SLO: 99.9% (0.1% allowed error rate) +Current error rate: 0.5% + +Burn rate = 0.5% / 0.1% = 5x +Time to exhaust = 30 days / 5 = 6 days +``` + +### Multi-Window Alerting + +Alert on burn rate across multiple time windows: + +**Fast burn** (1 hour window): +``` +Burn rate > 14.4x → Exhausts budget in 2 days +Alert after 2 minutes +Severity: Critical (page immediately) +``` + +**Moderate burn** (6 hour window): +``` +Burn rate > 6x → Exhausts budget in 5 days +Alert after 30 minutes +Severity: Warning (create ticket) +``` + +**Slow burn** (3 day window): +``` +Burn rate > 1x → Exhausts budget by end of month +Alert after 6 hours +Severity: Info (monitor) +``` + +### Implementation + +**Prometheus**: +```yaml +# Fast burn alert (1h window, 2m grace period) +- alert: ErrorBudgetFastBurn + expr: | + ( + sum(rate(http_requests_total{status=~"5.."}[1h])) + / + sum(rate(http_requests_total[1h])) + ) > (14.4 * 0.001) # 14.4x burn rate for 99.9% SLO + for: 2m + labels: + severity: critical + annotations: + summary: "Fast error budget burn detected" + description: "Error budget will be exhausted in 2 days at current rate" + +# Slow burn alert (6h window, 30m grace period) +- alert: ErrorBudgetSlowBurn + expr: | + ( + sum(rate(http_requests_total{status=~"5.."}[6h])) + / + sum(rate(http_requests_total[6h])) + ) > (6 * 0.001) # 6x burn rate for 99.9% SLO + for: 30m + labels: + severity: warning + annotations: + summary: "Elevated error budget burn detected" +``` + +--- + +## SLO Reporting + +### Dashboard Structure + +**Overall Health**: +``` +┌─────────────────────────────────────────┐ +│ SLO Compliance: 99.92% ✅ │ +│ Error Budget Remaining: 73% 🟢 │ +│ Burn Rate: 0.8x 🟢 │ +└─────────────────────────────────────────┘ +``` + +**SLI Performance**: +``` +Latency p95: 420ms (Target: 500ms) ✅ +Error Rate: 0.08% (Target: < 0.1%) ✅ +Availability: 99.95% (Target: > 99.9%) ✅ +``` + +**Error Budget Trend**: +``` +Graph showing: +- Error budget consumption over time +- Burn rate spikes +- Incidents marked +- Deploy events overlaid +``` + +### Monthly SLO Report + +**Template**: +```markdown +# SLO Report: October 2024 + +## Executive Summary +- ✅ All SLOs met this month +- 🟡 Latency SLO came close to violation (99.1% compliance) +- 3 incidents consumed 47% of error budget +- Error budget remaining: 53% + +## SLO Performance + +### Availability SLO: 99.9% +- Actual: 99.92% +- Status: ✅ Met +- Error budget consumed: 33% 
+- Downtime: 23 minutes (allowed: 43.2 minutes) + +### Latency SLO: p95 < 500ms +- Actual p95: 445ms +- Status: ✅ Met +- Compliance: 99.1% (target: 99%) +- 0.9% of requests exceeded threshold + +### Error Rate SLO: < 0.1% +- Actual: 0.05% +- Status: ✅ Met +- Error budget consumed: 50% + +## Incidents + +### Incident #1: Database Overload (Oct 5) +- Duration: 15 minutes +- Error budget consumed: 35% +- Root cause: Slow query after schema change +- Prevention: Added query review to deploy checklist + +### Incident #2: API Gateway Timeout (Oct 12) +- Duration: 5 minutes +- Error budget consumed: 10% +- Root cause: Configuration error in load balancer +- Prevention: Automated configuration validation + +### Incident #3: Upstream Service Degradation (Oct 20) +- Duration: 3 minutes +- Error budget consumed: 2% +- Root cause: Third-party API outage +- Prevention: Implemented circuit breaker + +## Recommendations +1. Investigate latency near-miss (Oct 15-17) +2. Add automated rollback for database changes +3. Increase circuit breaker thresholds for third-party APIs +4. Consider tightening availability SLO to 99.95% + +## Next Month's Focus +- Reduce p95 latency to 400ms +- Implement automated canary deployments +- Add synthetic monitoring for critical paths +``` + +--- + +## SLA Structure + +### Components + +**Service Description**: +``` +The API Service provides RESTful endpoints for user management, +authentication, and data retrieval. +``` + +**Covered Metrics**: +``` +- Availability: Service is reachable and returns valid responses +- Latency: Time from request to response +- Error Rate: Percentage of requests returning errors +``` + +**SLA Targets**: +``` +Service commits to: +1. 99.9% monthly uptime +2. p95 API response time < 1 second +3. Error rate < 0.5% +``` + +**Measurement**: +``` +Metrics calculated from server-side monitoring: +- Uptime: Successful health check probes / total probes +- Latency: Server-side request duration (p95) +- Errors: HTTP 5xx responses / total responses + +Calculated monthly (first of month for previous month). +``` + +**Exclusions**: +``` +SLA does not cover: +- Scheduled maintenance (with 7 days notice) +- Client-side network issues +- DDoS attacks or force majeure +- Beta/preview features +- Issues caused by customer misuse +``` + +**Service Credits**: +``` +Monthly Uptime | Service Credit +---------------- | -------------- +< 99.9% (SLA) | 10% +< 99.0% | 25% +< 95.0% | 50% +``` + +**Claiming Credits**: +``` +Customer must: +1. Report violation within 30 days +2. Provide ticket numbers for support requests +3. Credits applied to next month's invoice +4. Credits do not exceed monthly fee +``` + +### Example SLAs by Industry + +**E-commerce**: +``` +- 99.95% availability +- p95 page load < 2s +- p99 checkout < 5s +- Credits: 5% per 0.1% below target +``` + +**Financial Services**: +``` +- 99.99% availability +- p99 transaction < 500ms +- Zero data loss +- Penalties: $10,000 per hour of downtime +``` + +**Media/Content**: +``` +- 99.9% availability +- p95 video start < 3s +- No credit system (best effort latency) +``` + +--- + +## Best Practices + +### 1. SLOs Should Be User-Centric +❌ "Database queries < 100ms" +✅ "API response time p95 < 500ms" + +### 2. Start Loose, Tighten Over Time +- Begin with achievable targets +- Build reliability culture +- Gradually raise bar + +### 3. Fewer, Better SLOs +- 1-3 SLOs per service +- Focus on user impact +- Avoid SLO sprawl + +### 4. 
SLAs More Conservative Than SLOs +``` +Internal SLO: 99.95% +Customer SLA: 99.9% +Margin: 0.05% buffer +``` + +### 5. Make Error Budgets Actionable +- Define policies at different thresholds +- Empower teams to make tradeoffs +- Review in planning meetings + +### 6. Document Everything +- How SLIs are measured +- Why targets were chosen +- Who owns each SLO +- How to interpret metrics + +### 7. Review Regularly +- Monthly SLO reviews +- Quarterly SLO adjustments +- Annual SLA renegotiation + +--- + +## Common Pitfalls + +### 1. Too Many SLOs +❌ 20 different SLOs per service +✅ 2-3 critical SLOs + +### 2. Unrealistic Targets +❌ 99.999% for non-critical service +✅ 99.9% with room to improve + +### 3. SLOs Without Error Budgets +❌ "Must always be 99.9%" +✅ "Budget for 0.1% errors" + +### 4. No Consequences +❌ Missing SLO has no impact +✅ Deploy freeze when budget exhausted + +### 5. SLA Equals SLO +❌ Promise exactly what you target +✅ SLA more conservative than SLO + +### 6. Ignoring User Experience +❌ "Our servers are up 99.99%" +✅ "Users can complete actions 99.9% of the time" + +### 7. Static Targets +❌ Set once, never revisit +✅ Quarterly reviews and adjustments + +--- + +## Tools and Automation + +### SLO Tracking Tools + +**Prometheus + Grafana**: +- Use recording rules for SLIs +- Alert on burn rates +- Dashboard for compliance + +**Google Cloud SLO Monitoring**: +- Built-in SLO tracking +- Automatic error budget calculation +- Integration with alerting + +**Datadog SLOs**: +- UI for SLO definition +- Automatic burn rate alerts +- Status pages + +**Custom Tools**: +- sloth: Generate Prometheus rules from SLO definitions +- slo-libsonnet: Jsonnet library for SLO monitoring + +### Example: Prometheus Recording Rules + +```yaml +groups: + - name: sli_recording + interval: 30s + rules: + # SLI: Request success rate + - record: sli:request_success:ratio + expr: | + sum(rate(http_requests_total{status!~"5.."}[5m])) + / + sum(rate(http_requests_total[5m])) + + # SLI: Request latency (p95) + - record: sli:request_latency:p95 + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) + ) + + # Error budget burn rate (1h window) + - record: slo:error_budget_burn_rate:1h + expr: | + (1 - sli:request_success:ratio) / 0.001 +``` diff --git a/references/tool_comparison.md b/references/tool_comparison.md new file mode 100644 index 0000000..849b21a --- /dev/null +++ b/references/tool_comparison.md @@ -0,0 +1,697 @@ +# Monitoring Tools Comparison + +## Overview Matrix + +| Tool | Type | Best For | Complexity | Cost | Cloud/Self-Hosted | +|------|------|----------|------------|------|-------------------| +| **Prometheus** | Metrics | Kubernetes, time-series | Medium | Free | Self-hosted | +| **Grafana** | Visualization | Dashboards, multi-source | Low-Medium | Free | Both | +| **Datadog** | Full-stack | Ease of use, APM | Low | High | Cloud | +| **New Relic** | Full-stack | APM, traces | Low | High | Cloud | +| **Elasticsearch (ELK)** | Logs | Log search, analysis | High | Medium | Both | +| **Grafana Loki** | Logs | Cost-effective logs | Medium | Free | Both | +| **CloudWatch** | AWS-native | AWS infrastructure | Low | Medium | Cloud | +| **Jaeger** | Tracing | Distributed tracing | Medium | Free | Self-hosted | +| **Grafana Tempo** | Tracing | Cost-effective tracing | Medium | Free | Self-hosted | + +--- + +## Metrics Platforms + +### Prometheus + +**Type**: Open-source time-series database + +**Strengths**: +- ✅ Industry standard for Kubernetes +- ✅ Powerful 
query language (PromQL) +- ✅ Pull-based model (no agent config) +- ✅ Service discovery +- ✅ Free and open source +- ✅ Huge ecosystem (exporters for everything) + +**Weaknesses**: +- ❌ No built-in dashboards (need Grafana) +- ❌ Single-node only (no HA without federation) +- ❌ Limited long-term storage (need Thanos/Cortex) +- ❌ Steep learning curve for PromQL + +**Best For**: +- Kubernetes monitoring +- Infrastructure metrics +- Custom application metrics +- Organizations that need control + +**Pricing**: Free (open source) + +**Setup Complexity**: Medium + +**Example**: +```yaml +# prometheus.yml +scrape_configs: + - job_name: 'app' + static_configs: + - targets: ['localhost:8080'] +``` + +--- + +### Datadog + +**Type**: SaaS monitoring platform + +**Strengths**: +- ✅ Easy to set up (install agent, done) +- ✅ Beautiful pre-built dashboards +- ✅ APM, logs, metrics, traces in one platform +- ✅ Great anomaly detection +- ✅ Excellent integrations (500+) +- ✅ Good mobile app + +**Weaknesses**: +- ❌ Very expensive at scale +- ❌ Vendor lock-in +- ❌ Cost can be unpredictable (per-host pricing) +- ❌ Limited PromQL support + +**Best For**: +- Teams that want quick setup +- Companies prioritizing ease of use over cost +- Organizations needing full observability + +**Pricing**: $15-$31/host/month + custom metrics fees + +**Setup Complexity**: Low + +**Example**: +```bash +# Install agent +DD_API_KEY=xxx bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)" +``` + +--- + +### New Relic + +**Type**: SaaS application performance monitoring + +**Strengths**: +- ✅ Excellent APM capabilities +- ✅ User-friendly interface +- ✅ Good transaction tracing +- ✅ Comprehensive alerting +- ✅ Generous free tier + +**Weaknesses**: +- ❌ Can get expensive at scale +- ❌ Vendor lock-in +- ❌ Query language less powerful than PromQL +- ❌ Limited customization + +**Best For**: +- Application performance monitoring +- Teams focused on APM over infrastructure +- Startups (free tier is generous) + +**Pricing**: Free up to 100GB/month, then $0.30/GB + +**Setup Complexity**: Low + +**Example**: +```python +import newrelic.agent +newrelic.agent.initialize('newrelic.ini') +``` + +--- + +### CloudWatch + +**Type**: AWS-native monitoring + +**Strengths**: +- ✅ Zero setup for AWS services +- ✅ Native integration with AWS +- ✅ Automatic dashboards for AWS resources +- ✅ Tightly integrated with other AWS services +- ✅ Good for cost if already on AWS + +**Weaknesses**: +- ❌ AWS-only (not multi-cloud) +- ❌ Limited query capabilities +- ❌ High costs for custom metrics +- ❌ Basic visualization +- ❌ 1-minute minimum resolution + +**Best For**: +- AWS-centric infrastructure +- Quick setup for AWS services +- Organizations already invested in AWS + +**Pricing**: +- First 10 custom metrics: Free +- Additional: $0.30/metric/month +- API calls: $0.01/1000 requests + +**Setup Complexity**: Low (for AWS), Medium (for custom metrics) + +**Example**: +```python +import boto3 +cloudwatch = boto3.client('cloudwatch') +cloudwatch.put_metric_data( + Namespace='MyApp', + MetricData=[{'MetricName': 'RequestCount', 'Value': 1}] +) +``` + +--- + +### Grafana Cloud / Mimir + +**Type**: Managed Prometheus-compatible + +**Strengths**: +- ✅ Prometheus-compatible (PromQL) +- ✅ Managed service (no ops burden) +- ✅ Good cost model (pay for what you use) +- ✅ Grafana dashboards included +- ✅ Long-term storage + +**Weaknesses**: +- ❌ Relatively new (less mature) +- ❌ Some Prometheus features missing +- ❌ Requires Grafana for visualization + 
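In the same spirit as the examples for the other tools, a minimal sketch of shipping samples from an existing Prometheus to a Prometheus-compatible managed backend via `remote_write` (the endpoint URL, username, and key-file path below are placeholders, not real values):

```yaml
# prometheus.yml — forward samples to a managed, Prometheus-compatible backend
remote_write:
  - url: https://prometheus-prod-XX.grafana.net/api/prom/push   # placeholder endpoint
    basic_auth:
      username: "123456"                                        # placeholder instance/stack ID
      password_file: /etc/prometheus/grafana_cloud_api_key      # placeholder API key file
```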
+**Best For**: +- Teams wanting Prometheus without ops overhead +- Multi-cloud environments +- Organizations already using Grafana + +**Pricing**: $8/month + $0.29/1M samples + +**Setup Complexity**: Low-Medium + +--- + +## Logging Platforms + +### Elasticsearch (ELK Stack) + +**Type**: Open-source log search and analytics + +**Full Stack**: Elasticsearch + Logstash + Kibana + +**Strengths**: +- ✅ Powerful search capabilities +- ✅ Rich query language +- ✅ Great for log analysis +- ✅ Mature ecosystem +- ✅ Can handle large volumes +- ✅ Flexible data model + +**Weaknesses**: +- ❌ Complex to operate +- ❌ Resource intensive (RAM hungry) +- ❌ Expensive at scale +- ❌ Requires dedicated ops team +- ❌ Slow for high-cardinality queries + +**Best For**: +- Large organizations with ops teams +- Deep log analysis needs +- Search-heavy use cases + +**Pricing**: Free (open source) + infrastructure costs + +**Infrastructure**: ~$500-2000/month for medium scale + +**Setup Complexity**: High + +**Example**: +```json +PUT /logs-2024.10/_doc/1 +{ + "timestamp": "2024-10-28T14:32:15Z", + "level": "error", + "message": "Payment failed" +} +``` + +--- + +### Grafana Loki + +**Type**: Log aggregation system + +**Strengths**: +- ✅ Cost-effective (labels only, not full-text indexing) +- ✅ Easy to operate +- ✅ Prometheus-like label model +- ✅ Great Grafana integration +- ✅ Low resource usage +- ✅ Fast time-range queries + +**Weaknesses**: +- ❌ Limited full-text search +- ❌ Requires careful label design +- ❌ Younger ecosystem than ELK +- ❌ Not ideal for complex queries + +**Best For**: +- Cost-conscious organizations +- Kubernetes environments +- Teams already using Prometheus +- Time-series log queries + +**Pricing**: Free (open source) + infrastructure costs + +**Infrastructure**: ~$100-500/month for medium scale + +**Setup Complexity**: Medium + +**Example**: +```logql +{job="api", environment="prod"} |= "error" | json | level="error" +``` + +--- + +### Splunk + +**Type**: Enterprise log management + +**Strengths**: +- ✅ Extremely powerful search +- ✅ Great for security/compliance +- ✅ Mature platform +- ✅ Enterprise support +- ✅ Machine learning features + +**Weaknesses**: +- ❌ Very expensive +- ❌ Complex pricing (per GB ingested) +- ❌ Steep learning curve +- ❌ Heavy resource usage + +**Best For**: +- Large enterprises +- Security operations centers (SOCs) +- Compliance-heavy industries + +**Pricing**: $150-$1800/GB/month (depending on tier) + +**Setup Complexity**: Medium-High + +--- + +### CloudWatch Logs + +**Type**: AWS-native log management + +**Strengths**: +- ✅ Zero setup for AWS services +- ✅ Integrated with AWS ecosystem +- ✅ CloudWatch Insights for queries +- ✅ Reasonable cost for low volume + +**Weaknesses**: +- ❌ AWS-only +- ❌ Limited query capabilities +- ❌ Expensive at high volume +- ❌ Basic visualization + +**Best For**: +- AWS-centric applications +- Low-volume logging +- Simple log aggregation + +**Pricing**: Tiered (as of May 2025) +- Vended Logs: $0.50/GB (first 10TB), $0.25/GB (next 20TB), then lower tiers +- Standard logs: $0.50/GB flat +- Storage: $0.03/GB + +**Setup Complexity**: Low (AWS), Medium (custom) + +--- + +### Sumo Logic + +**Type**: SaaS log management + +**Strengths**: +- ✅ Easy to use +- ✅ Good for cloud-native apps +- ✅ Real-time analytics +- ✅ Good compliance features + +**Weaknesses**: +- ❌ Expensive at scale +- ❌ Vendor lock-in +- ❌ Limited customization + +**Best For**: +- Cloud-native applications +- Teams wanting managed solution +- Security and compliance use cases + 
+**Pricing**: $90-$180/GB/month + +**Setup Complexity**: Low + +--- + +## Tracing Platforms + +### Jaeger + +**Type**: Open-source distributed tracing + +**Strengths**: +- ✅ Industry standard +- ✅ CNCF graduated project +- ✅ Supports OpenTelemetry +- ✅ Good UI +- ✅ Free and open source + +**Weaknesses**: +- ❌ Requires separate storage backend +- ❌ Limited query capabilities +- ❌ No built-in analytics + +**Best For**: +- Microservices architectures +- Kubernetes environments +- OpenTelemetry users + +**Pricing**: Free (open source) + storage costs + +**Setup Complexity**: Medium + +--- + +### Grafana Tempo + +**Type**: Open-source distributed tracing + +**Strengths**: +- ✅ Cost-effective (object storage) +- ✅ Easy to operate +- ✅ Great Grafana integration +- ✅ TraceQL query language +- ✅ Supports OpenTelemetry + +**Weaknesses**: +- ❌ Younger than Jaeger +- ❌ Limited third-party integrations +- ❌ Requires Grafana for UI + +**Best For**: +- Cost-conscious organizations +- Teams using Grafana stack +- High trace volumes + +**Pricing**: Free (open source) + storage costs + +**Setup Complexity**: Medium + +--- + +### Datadog APM + +**Type**: SaaS application performance monitoring + +**Strengths**: +- ✅ Easy to set up +- ✅ Excellent trace visualization +- ✅ Integrated with metrics/logs +- ✅ Automatic service map +- ✅ Good profiling features + +**Weaknesses**: +- ❌ Expensive ($31/host/month) +- ❌ Vendor lock-in +- ❌ Limited sampling control + +**Best For**: +- Teams wanting ease of use +- Organizations already using Datadog +- Complex microservices + +**Pricing**: $31/host/month + $1.70/million spans + +**Setup Complexity**: Low + +--- + +### AWS X-Ray + +**Type**: AWS-native distributed tracing + +**Strengths**: +- ✅ Native AWS integration +- ✅ Automatic instrumentation for AWS services +- ✅ Low cost + +**Weaknesses**: +- ❌ AWS-only +- ❌ Basic UI +- ❌ Limited query capabilities + +**Best For**: +- AWS-centric applications +- Serverless architectures (Lambda) +- Cost-sensitive projects + +**Pricing**: $5/million traces, first 100k free/month + +**Setup Complexity**: Low (AWS), Medium (custom) + +--- + +## Full-Stack Observability + +### Datadog (Full Platform) + +**Components**: Metrics, logs, traces, RUM, synthetics + +**Strengths**: +- ✅ Everything in one platform +- ✅ Excellent user experience +- ✅ Correlation across signals +- ✅ Great for teams + +**Weaknesses**: +- ❌ Very expensive ($50-100+/host/month) +- ❌ Vendor lock-in +- ❌ Unpredictable costs + +**Total Cost** (example 100 hosts): +- Infrastructure: $3,100/month +- APM: $3,100/month +- Logs: ~$2,000/month +- **Total: ~$8,000/month** + +--- + +### Grafana Stack (LGTM) + +**Components**: Loki (logs), Grafana (viz), Tempo (traces), Mimir/Prometheus (metrics) + +**Strengths**: +- ✅ Open source and cost-effective +- ✅ Unified visualization +- ✅ Prometheus-compatible +- ✅ Great for cloud-native + +**Weaknesses**: +- ❌ Requires self-hosting or Grafana Cloud +- ❌ More ops burden +- ❌ Less polished than commercial tools + +**Total Cost** (self-hosted, 100 hosts): +- Infrastructure: ~$1,500/month +- Ops time: Variable +- **Total: ~$1,500-3,000/month** + +--- + +### Elastic Observability + +**Components**: Elasticsearch (logs), Kibana (viz), APM, metrics + +**Strengths**: +- ✅ Powerful search +- ✅ Mature platform +- ✅ Good for log-heavy use cases + +**Weaknesses**: +- ❌ Complex to operate +- ❌ Expensive infrastructure +- ❌ Resource intensive + +**Total Cost** (self-hosted, 100 hosts): +- Infrastructure: ~$3,000-5,000/month +- Ops time: High +- 
**Total: ~$4,000-7,000/month** + +--- + +### New Relic One + +**Components**: Metrics, logs, traces, synthetics + +**Strengths**: +- ✅ Generous free tier (100GB) +- ✅ User-friendly +- ✅ Good for startups + +**Weaknesses**: +- ❌ Costs increase quickly after free tier +- ❌ Vendor lock-in + +**Total Cost**: +- Free: up to 100GB/month +- Paid: $0.30/GB beyond 100GB + +--- + +## Cloud Provider Native + +### AWS (CloudWatch + X-Ray) + +**Use When**: +- Primarily on AWS +- Simple monitoring needs +- Want minimal setup + +**Avoid When**: +- Multi-cloud environment +- Need advanced features +- High log volume (expensive) + +**Cost** (example): +- 100 EC2 instances with basic metrics: ~$150/month +- 1TB logs: ~$500/month ingestion + storage +- X-Ray: ~$50/month + +--- + +### GCP (Cloud Monitoring + Cloud Trace) + +**Use When**: +- Primarily on GCP +- Using GKE +- Want tight GCP integration + +**Avoid When**: +- Multi-cloud environment +- Need advanced querying + +**Cost** (example): +- First 150MB/month per resource: Free +- Additional: $0.2508/MB + +--- + +### Azure (Azure Monitor) + +**Use When**: +- Primarily on Azure +- Using AKS +- Need Azure integration + +**Avoid When**: +- Multi-cloud +- Need advanced features + +**Cost** (example): +- First 5GB: Free +- Additional: $2.76/GB + +--- + +## Decision Matrix + +### Choose Prometheus + Grafana If: +- ✅ Using Kubernetes +- ✅ Want control and customization +- ✅ Have ops capacity +- ✅ Budget-conscious +- ✅ Need Prometheus ecosystem + +### Choose Datadog If: +- ✅ Want ease of use +- ✅ Need full observability now +- ✅ Budget allows ($8k+/month for 100 hosts) +- ✅ Limited ops team +- ✅ Need excellent UX + +### Choose ELK If: +- ✅ Heavy log analysis needs +- ✅ Need powerful search +- ✅ Have dedicated ops team +- ✅ Compliance requirements +- ✅ Willing to invest in infrastructure + +### Choose Grafana Stack (LGTM) If: +- ✅ Want open source full stack +- ✅ Cost-effective solution +- ✅ Cloud-native architecture +- ✅ Already using Prometheus +- ✅ Have some ops capacity + +### Choose New Relic If: +- ✅ Startup with free tier +- ✅ APM is priority +- ✅ Want easy setup +- ✅ Don't need heavy customization + +### Choose Cloud Native (CloudWatch/etc) If: +- ✅ Single cloud provider +- ✅ Simple needs +- ✅ Want minimal setup +- ✅ Low to medium scale + +--- + +## Cost Comparison + +**Example: 100 hosts, 1TB logs/month, 1M spans/day** + +| Solution | Monthly Cost | Setup | Ops Burden | +|----------|-------------|--------|------------| +| **Prometheus + Loki + Tempo** | $1,500 | Medium | Medium | +| **Grafana Cloud** | $3,000 | Low | Low | +| **Datadog** | $8,000 | Low | None | +| **New Relic** | $3,500 | Low | None | +| **ELK Stack** | $4,000 | High | High | +| **CloudWatch** | $2,000 | Low | Low | + +--- + +## Recommendations by Company Size + +### Startup (< 10 engineers) +**Recommendation**: New Relic or Grafana Cloud +- Minimal ops burden +- Good free tiers +- Easy to get started + +### Small Company (10-50 engineers) +**Recommendation**: Prometheus + Grafana + Loki (self-hosted or cloud) +- Cost-effective +- Growing ops capacity +- Flexibility + +### Medium Company (50-200 engineers) +**Recommendation**: Datadog or Grafana Stack +- Datadog if budget allows +- Grafana Stack if cost-conscious + +### Large Enterprise (200+ engineers) +**Recommendation**: Build observability platform +- Mix of tools based on needs +- Dedicated observability team +- Custom integrations diff --git a/references/tracing_guide.md b/references/tracing_guide.md new file mode 100644 index 
0000000..7e72fcb --- /dev/null +++ b/references/tracing_guide.md @@ -0,0 +1,663 @@ +# Distributed Tracing Guide + +## What is Distributed Tracing? + +Distributed tracing tracks a request as it flows through multiple services in a distributed system. + +### Key Concepts + +**Trace**: End-to-end journey of a request +**Span**: Single operation within a trace +**Context**: Metadata propagated between services (trace_id, span_id) + +### Example Flow +``` +User Request → API Gateway → Auth Service → User Service → Database + ↓ ↓ ↓ + [Trace ID: abc123] + Span 1: gateway (50ms) + Span 2: auth (20ms) + Span 3: user_service (100ms) + Span 4: db_query (80ms) + +Total: 250ms with waterfall view showing dependencies +``` + +--- + +## OpenTelemetry (OTel) + +OpenTelemetry is the industry standard for instrumentation. + +### Components + +**API**: Instrument code (create spans, add attributes) +**SDK**: Implement API, configure exporters +**Collector**: Receive, process, and export telemetry data +**Exporters**: Send data to backends (Jaeger, Tempo, Zipkin) + +### Architecture +``` +Application → OTel SDK → OTel Collector → Backend (Jaeger/Tempo) + ↓ + Visualization +``` + +--- + +## Instrumentation Examples + +### Python (using OpenTelemetry) + +**Setup**: +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +# Setup tracer +trace.set_tracer_provider(TracerProvider()) +tracer = trace.get_tracer(__name__) + +# Configure exporter +otlp_exporter = OTLPSpanExporter(endpoint="localhost:4317") +span_processor = BatchSpanProcessor(otlp_exporter) +trace.get_tracer_provider().add_span_processor(span_processor) +``` + +**Manual instrumentation**: +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +@tracer.start_as_current_span("process_order") +def process_order(order_id): + span = trace.get_current_span() + span.set_attribute("order.id", order_id) + span.set_attribute("order.amount", 99.99) + + try: + result = payment_service.charge(order_id) + span.set_attribute("payment.status", "success") + return result + except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +**Auto-instrumentation** (Flask example): +```python +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor + +# Auto-instrument Flask +FlaskInstrumentor().instrument_app(app) + +# Auto-instrument requests library +RequestsInstrumentor().instrument() + +# Auto-instrument SQLAlchemy +SQLAlchemyInstrumentor().instrument(engine=db.engine) +``` + +### Node.js (using OpenTelemetry) + +**Setup**: +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); + +// Setup provider +const provider = new NodeTracerProvider(); +const exporter = new OTLPTraceExporter({ url: 'localhost:4317' }); +provider.addSpanProcessor(new BatchSpanProcessor(exporter)); +provider.register(); +``` + +**Manual instrumentation**: +```javascript +const tracer = provider.getTracer('my-service'); + +async function processOrder(orderId) { + const span = 
tracer.startSpan('process_order'); + span.setAttribute('order.id', orderId); + + try { + const result = await paymentService.charge(orderId); + span.setAttribute('payment.status', 'success'); + return result; + } catch (error) { + span.setStatus({ code: SpanStatusCode.ERROR }); + span.recordException(error); + throw error; + } finally { + span.end(); + } +} +``` + +**Auto-instrumentation**: +```javascript +const { registerInstrumentations } = require('@opentelemetry/instrumentation'); +const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http'); +const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express'); +const { MongoDBInstrumentation } = require('@opentelemetry/instrumentation-mongodb'); + +registerInstrumentations({ + instrumentations: [ + new HttpInstrumentation(), + new ExpressInstrumentation(), + new MongoDBInstrumentation() + ] +}); +``` + +### Go (using OpenTelemetry) + +**Setup**: +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/trace" +) + +func initTracer() { + exporter, _ := otlptracegrpc.New(context.Background()) + tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), + ) + otel.SetTracerProvider(tp) +} +``` + +**Manual instrumentation**: +```go +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" +) + +func processOrder(ctx context.Context, orderID string) error { + tracer := otel.Tracer("my-service") + ctx, span := tracer.Start(ctx, "process_order") + defer span.End() + + span.SetAttributes( + attribute.String("order.id", orderID), + attribute.Float64("order.amount", 99.99), + ) + + err := paymentService.Charge(ctx, orderID) + if err != nil { + span.RecordError(err) + return err + } + + span.SetAttributes(attribute.String("payment.status", "success")) + return nil +} +``` + +--- + +## Span Attributes + +### Semantic Conventions + +Follow OpenTelemetry semantic conventions for consistency: + +**HTTP**: +```python +span.set_attribute("http.method", "GET") +span.set_attribute("http.url", "https://api.example.com/users") +span.set_attribute("http.status_code", 200) +span.set_attribute("http.user_agent", "Mozilla/5.0...") +``` + +**Database**: +```python +span.set_attribute("db.system", "postgresql") +span.set_attribute("db.name", "users_db") +span.set_attribute("db.statement", "SELECT * FROM users WHERE id = ?") +span.set_attribute("db.operation", "SELECT") +``` + +**RPC/gRPC**: +```python +span.set_attribute("rpc.system", "grpc") +span.set_attribute("rpc.service", "UserService") +span.set_attribute("rpc.method", "GetUser") +span.set_attribute("rpc.grpc.status_code", 0) +``` + +**Messaging**: +```python +span.set_attribute("messaging.system", "kafka") +span.set_attribute("messaging.destination", "user-events") +span.set_attribute("messaging.operation", "publish") +span.set_attribute("messaging.message_id", "msg123") +``` + +### Custom Attributes + +Add business context: +```python +span.set_attribute("user.id", "user123") +span.set_attribute("order.id", "ORD-456") +span.set_attribute("feature.flag.checkout_v2", True) +span.set_attribute("cache.hit", False) +``` + +--- + +## Context Propagation + +### W3C Trace Context (Standard) + +Headers propagated between services: +``` +traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01 +tracestate: vendor1=value1,vendor2=value2 +``` + +**Format**: `version-trace_id-parent_span_id-trace_flags` + +### Implementation + +**Python**: +```python +from 
opentelemetry.propagate import inject, extract
+import requests
+
+# Inject context into outgoing request
+headers = {}
+inject(headers)
+requests.get("https://api.example.com", headers=headers)
+
+# Extract context from incoming request
+from flask import request
+ctx = extract(request.headers)
+```
+
+**Node.js**:
+```javascript
+const { context, propagation } = require('@opentelemetry/api');
+const axios = require('axios');
+
+// Inject
+const headers = {};
+propagation.inject(context.active(), headers);
+axios.get('https://api.example.com', { headers });
+
+// Extract
+const ctx = propagation.extract(context.active(), req.headers);
+```
+
+**HTTP Example**:
+```bash
+curl -H "traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01" \
+  https://api.example.com/users
+```
+
+---
+
+## Sampling Strategies
+
+### 1. Always On/Off
+```python
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ALWAYS_OFF
+
+# Development: trace everything
+provider = TracerProvider(sampler=ALWAYS_ON)
+
+# Production: trace nothing (usually not desired)
+provider = TracerProvider(sampler=ALWAYS_OFF)
+```
+
+### 2. Probability-Based
+```python
+from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
+
+# Sample 10% of traces
+provider = TracerProvider(sampler=TraceIdRatioBased(0.1))
+```
+
+### 3. Rate Limiting
+The Python SDK does not ship a rate-limiting sampler. To cap traces per second, either implement a custom sampler (a token-bucket sketch follows section 4 below) or apply rate-based tail sampling in the OpenTelemetry Collector (the `tail_sampling` processor's `rate_limiting` policy).
+
+### 4. Parent-Based (Default)
+```python
+from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
+
+# If parent span is sampled, sample child spans
+sampler = ParentBased(root=TraceIdRatioBased(0.1))
+provider = TracerProvider(sampler=sampler)
+```
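+
+Because there is no stock rate limiter in the SDK, one option is to approximate it with a custom sampler. Below is a minimal, illustrative token-bucket sketch; the class name and the 100 traces/second limit are arbitrary, and it is not tuned for production use:
+
+```python
+import threading
+import time
+
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.sampling import Decision, Sampler, SamplingResult
+
+
+class SimpleRateLimitingSampler(Sampler):
+    """Sample at most `max_per_second` spans per second using a token bucket."""
+
+    def __init__(self, max_per_second):
+        self._max = float(max_per_second)
+        self._tokens = self._max
+        self._last = time.monotonic()
+        self._lock = threading.Lock()
+
+    def should_sample(self, parent_context, trace_id, name, kind=None,
+                      attributes=None, links=None, trace_state=None):
+        with self._lock:
+            now = time.monotonic()
+            # Refill tokens for the elapsed time, capped at the per-second budget
+            self._tokens = min(self._max, self._tokens + (now - self._last) * self._max)
+            self._last = now
+            if self._tokens >= 1:
+                self._tokens -= 1
+                return SamplingResult(Decision.RECORD_AND_SAMPLE, attributes)
+        return SamplingResult(Decision.DROP)
+
+    def get_description(self):
+        return f"SimpleRateLimitingSampler({self._max}/s)"
+
+
+provider = TracerProvider(sampler=SimpleRateLimitingSampler(100))
+```
+
+Wrapping it as `ParentBased(root=SimpleRateLimitingSampler(100))` keeps the rate decision at the root span so child spans simply follow their parent.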
+
+### 5. Custom Sampling
+```python
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.sampling import Decision, Sampler, SamplingResult
+
+class ErrorSampler(Sampler):
+    """Always sample errors, sample ~1% of successes."""
+
+    def should_sample(self, parent_context, trace_id, name, kind=None,
+                      attributes=None, links=None, trace_state=None):
+        attributes = attributes or {}
+
+        # Always sample spans flagged as errors
+        if attributes.get('error', False):
+            return SamplingResult(Decision.RECORD_AND_SAMPLE, attributes)
+
+        # Sample roughly 1% of successes (3 out of 256 trace IDs)
+        if trace_id & 0xFF < 3:
+            return SamplingResult(Decision.RECORD_AND_SAMPLE, attributes)
+
+        return SamplingResult(Decision.DROP)
+
+    def get_description(self):
+        return "ErrorSampler"
+
+provider = TracerProvider(sampler=ErrorSampler())
+```
+
+---
+
+## Backends
+
+### Jaeger
+
+**Docker Compose**:
+```yaml
+version: '3'
+services:
+  jaeger:
+    image: jaegertracing/all-in-one:latest
+    ports:
+      - "16686:16686"  # UI
+      - "4317:4317"    # OTLP gRPC
+      - "4318:4318"    # OTLP HTTP
+    environment:
+      - COLLECTOR_OTLP_ENABLED=true
+```
+
+**Query traces**:
+```bash
+# UI: http://localhost:16686
+
+# API: Get trace by ID
+curl http://localhost:16686/api/traces/abc123
+
+# Search traces
+curl "http://localhost:16686/api/traces?service=my-service&limit=20"
+```
+
+### Grafana Tempo
+
+**Docker Compose**:
+```yaml
+version: '3'
+services:
+  tempo:
+    image: grafana/tempo:latest
+    ports:
+      - "3200:3200"  # Tempo
+      - "4317:4317"  # OTLP gRPC
+    volumes:
+      - ./tempo.yaml:/etc/tempo.yaml
+    command: ["-config.file=/etc/tempo.yaml"]
+```
+
+**tempo.yaml**:
+```yaml
+server:
+  http_listen_port: 3200
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+
+storage:
+  trace:
+    backend: local
+    local:
+      path: /tmp/tempo/traces
+```
+
+**Query in Grafana**:
+- Install Tempo data source
+- Use TraceQL: `{ span.http.status_code = 500 }`
+
+### AWS X-Ray
+
+**Configuration**:
+```python
+from aws_xray_sdk.core import xray_recorder
+from aws_xray_sdk.ext.flask.middleware import XRayMiddleware
+
+xray_recorder.configure(service='my-service')
+XRayMiddleware(app, xray_recorder)
+```
+
+**Query**:
+```bash
+aws xray get-trace-summaries \
+  --start-time 2024-10-28T00:00:00 \
+  --end-time 2024-10-28T23:59:59 \
+  --filter-expression 'error = true'
+```
+
+---
+
+## Analysis Patterns
+
+### Find Slow Traces
+```
+# Jaeger UI
+- Filter by service
+- Set min duration: 1000ms
+- Sort by duration
+
+# TraceQL (Tempo)
+{ duration > 1s }
+```
+
+### Find Error Traces
+```
+# Jaeger UI
+- Filter by tag: error=true
+- Or by HTTP status: http.status_code=500
+
+# TraceQL (Tempo)
+{ span.http.status_code >= 500 }
+```
+
+### Find Traces by User
+```
+# Jaeger UI
+- Filter by tag: user.id=user123
+
+# TraceQL (Tempo)
+{ span.user.id = "user123" }
+```
+
+### Find N+1 Query Problems
+Look for (a short detection sketch follows this list):
+- Many sequential database spans
+- Same query repeated multiple times
+- Pattern: API call → DB query → DB query → DB query...
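+
+A minimal offline sketch of this check, assuming spans have been exported as dictionaries with `parent_span_id`, `name`, and an `attributes` map (the field names are illustrative; adapt them to your backend's export format):
+
+```python
+from collections import Counter, defaultdict
+
+
+def find_n_plus_one(spans, min_repeats=5):
+    """Flag parents whose children repeat the same database operation many times."""
+    db_children = defaultdict(list)
+    for span in spans:
+        parent = span.get("parent_span_id")
+        # Only consider spans that look like database calls
+        if parent and span.get("attributes", {}).get("db.system"):
+            db_children[parent].append(span["name"])
+
+    suspects = []
+    for parent, names in db_children.items():
+        for name, count in Counter(names).items():
+            if count >= min_repeats:
+                suspects.append({"parent_span_id": parent, "operation": name, "count": count})
+    return suspects
+```
+
+In practice the spans would come from your backend's API (for example the Jaeger search endpoint shown above), flattened into this shape before the check runs.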
+ +### Find Service Bottlenecks +- Identify spans with longest duration +- Check if time is spent in service logic or waiting for dependencies +- Look at span relationships (parallel vs sequential) + +--- + +## Integration with Logs + +### Trace ID in Logs + +**Python**: +```python +from opentelemetry import trace + +def add_trace_context(): + span = trace.get_current_span() + trace_id = span.get_span_context().trace_id + span_id = span.get_span_context().span_id + + return { + "trace_id": format(trace_id, '032x'), + "span_id": format(span_id, '016x') + } + +logger.info("Processing order", **add_trace_context(), order_id=order_id) +``` + +**Query logs for trace**: +``` +# Elasticsearch +GET /logs/_search +{ + "query": { + "match": { "trace_id": "0af7651916cd43dd8448eb211c80319c" } + } +} + +# Loki (LogQL) +{job="app"} |= "0af7651916cd43dd8448eb211c80319c" +``` + +### Trace from Log (Grafana) + +Configure derived fields in Grafana: +```yaml +datasources: + - name: Loki + type: loki + jsonData: + derivedFields: + - name: TraceID + matcherRegex: "trace_id=([\\w]+)" + url: "http://tempo:3200/trace/$${__value.raw}" + datasourceUid: tempo_uid +``` + +--- + +## Best Practices + +### 1. Span Naming +✅ Use operation names, not IDs +- Good: `GET /api/users`, `UserService.GetUser`, `db.query.users` +- Bad: `/api/users/123`, `span_abc`, `query_1` + +### 2. Span Granularity +✅ One span per logical operation +- Too coarse: One span for entire request +- Too fine: Span for every variable assignment +- Just right: Span per service call, database query, external API + +### 3. Add Context +Always include: +- Operation name +- Service name +- Error status +- Business identifiers (user_id, order_id) + +### 4. Handle Errors +```python +try: + result = operation() +except Exception as e: + span.set_status(trace.Status(trace.StatusCode.ERROR)) + span.record_exception(e) + raise +``` + +### 5. Sampling Strategy +- Development: 100% +- Staging: 50-100% +- Production: 1-10% (or error-based) + +### 6. Performance Impact +- Overhead: ~1-5% CPU +- Use async exporters +- Batch span exports +- Sample appropriately + +### 7. 
Cardinality +Avoid high-cardinality attributes: +- ❌ Email addresses +- ❌ Full URLs with unique IDs +- ❌ Timestamps +- ✅ User ID +- ✅ Endpoint pattern +- ✅ Status code + +--- + +## Common Issues + +### Missing Traces +**Cause**: Context not propagated +**Solution**: Verify headers are injected/extracted + +### Incomplete Traces +**Cause**: Spans not closed properly +**Solution**: Always use `defer span.End()` or context managers + +### High Overhead +**Cause**: Too many spans or synchronous export +**Solution**: Reduce span count, use batch processor + +### No Error Traces +**Cause**: Errors not recorded on spans +**Solution**: Call `span.record_exception()` and set error status + +--- + +## Metrics from Traces + +Generate RED metrics from trace data: + +**Rate**: Traces per second +**Errors**: Traces with error status +**Duration**: Span duration percentiles + +**Example** (using Tempo + Prometheus): +```yaml +# Generate metrics from spans +metrics_generator: + processor: + span_metrics: + dimensions: + - http.method + - http.status_code +``` + +**Query**: +```promql +# Request rate +rate(traces_spanmetrics_calls_total[5m]) + +# Error rate +rate(traces_spanmetrics_calls_total{status_code="STATUS_CODE_ERROR"}[5m]) + / +rate(traces_spanmetrics_calls_total[5m]) + +# P95 latency +histogram_quantile(0.95, traces_spanmetrics_latency_bucket) +``` diff --git a/scripts/alert_quality_checker.py b/scripts/alert_quality_checker.py new file mode 100644 index 0000000..d926bb5 --- /dev/null +++ b/scripts/alert_quality_checker.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +""" +Audit Prometheus alert rules against best practices. +Checks for: alert naming, severity labels, runbook links, expression quality. +""" + +import argparse +import sys +import os +import re +from typing import Dict, List, Any +from pathlib import Path + +try: + import yaml +except ImportError: + print("⚠️ Warning: 'PyYAML' library not found. 
Install with: pip install pyyaml") + sys.exit(1) + + +class AlertQualityChecker: + def __init__(self): + self.issues = [] + self.warnings = [] + self.recommendations = [] + + def check_alert_name(self, alert_name: str) -> List[str]: + """Check alert naming conventions.""" + issues = [] + + # Should be PascalCase or camelCase + if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name): + issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)") + + # Should be descriptive + if len(alert_name) < 5: + issues.append(f"Alert name '{alert_name}' is too short, use descriptive names") + + # Avoid generic names + generic_names = ['Alert', 'Test', 'Warning', 'Error'] + if alert_name in generic_names: + issues.append(f"Alert name '{alert_name}' is too generic") + + return issues + + def check_labels(self, alert: Dict[str, Any]) -> List[str]: + """Check required and recommended labels.""" + issues = [] + labels = alert.get('labels', {}) + + # Required labels + if 'severity' not in labels: + issues.append("Missing required 'severity' label (critical/warning/info)") + elif labels['severity'] not in ['critical', 'warning', 'info']: + issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info") + + # Recommended labels + if 'team' not in labels: + self.recommendations.append("Consider adding 'team' label for routing") + + if 'component' not in labels and 'service' not in labels: + self.recommendations.append("Consider adding 'component' or 'service' label") + + return issues + + def check_annotations(self, alert: Dict[str, Any]) -> List[str]: + """Check annotations quality.""" + issues = [] + annotations = alert.get('annotations', {}) + + # Required annotations + if 'summary' not in annotations: + issues.append("Missing 'summary' annotation") + elif len(annotations['summary']) < 10: + issues.append("Summary annotation is too short, provide clear description") + + if 'description' not in annotations: + issues.append("Missing 'description' annotation") + + # Runbook + if 'runbook_url' not in annotations and 'runbook' not in annotations: + self.recommendations.append("Consider adding 'runbook_url' for incident response") + + # Check for templating + if 'summary' in annotations: + if '{{ $value }}' not in annotations['summary'] and '{{' not in annotations['summary']: + self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})") + + return issues + + def check_expression(self, expr: str, alert_name: str) -> List[str]: + """Check PromQL expression quality.""" + issues = [] + + # Should have a threshold + if '>' not in expr and '<' not in expr and '==' not in expr and '!=' not in expr: + issues.append("Expression should include a comparison operator") + + # Should use rate() for counters + if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr: + self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)") + + # Avoid instant queries without aggregation + if not any(agg in expr for agg in ['sum(', 'avg(', 'min(', 'max(', 'count(']): + if expr.count('{') > 1: # Multiple metrics without aggregation + self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.") + + # Check for proper time windows + if '[' not in expr and 'rate(' in expr: + issues.append("rate() requires a time window (e.g., rate(metric[5m]))") + + return issues + + def check_for_duration(self, rule: Dict[str, Any]) -> List[str]: + """Check for 'for' clause to prevent flapping.""" + 
issues = [] + severity = rule.get('labels', {}).get('severity', 'unknown') + + if 'for' not in rule: + if severity == 'critical': + issues.append("Critical alerts should have 'for' clause to prevent flapping") + else: + self.warnings.append("Consider adding 'for' clause to prevent alert flapping") + else: + # Parse duration + duration = rule['for'] + if severity == 'critical' and any(x in duration for x in ['0s', '30s', '1m']): + self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts") + + return issues + + def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]: + """Check a single alert rule.""" + alert_name = rule.get('alert', 'Unknown') + issues = [] + + # Check alert name + issues.extend(self.check_alert_name(alert_name)) + + # Check expression + if 'expr' not in rule: + issues.append("Missing 'expr' field") + else: + issues.extend(self.check_expression(rule['expr'], alert_name)) + + # Check labels + issues.extend(self.check_labels(rule)) + + # Check annotations + issues.extend(self.check_annotations(rule)) + + # Check for duration + issues.extend(self.check_for_duration(rule)) + + return { + "alert": alert_name, + "issues": issues, + "severity": rule.get('labels', {}).get('severity', 'unknown') + } + + def analyze_file(self, filepath: str) -> Dict[str, Any]: + """Analyze a Prometheus rules file.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + if not data: + return {"error": "Empty or invalid YAML file"} + + results = [] + groups = data.get('groups', []) + + for group in groups: + group_name = group.get('name', 'Unknown') + rules = group.get('rules', []) + + for rule in rules: + # Only check alerting rules, not recording rules + if 'alert' in rule: + result = self.check_alert_rule(rule) + result['group'] = group_name + results.append(result) + + return { + "file": filepath, + "groups": len(groups), + "alerts_checked": len(results), + "results": results + } + + except Exception as e: + return {"error": f"Failed to parse file: {e}"} + + +def print_results(analysis: Dict[str, Any], checker: AlertQualityChecker): + """Pretty print analysis results.""" + print("\n" + "="*60) + print("🚨 ALERT QUALITY CHECK RESULTS") + print("="*60) + + if "error" in analysis: + print(f"\n❌ Error: {analysis['error']}") + return + + print(f"\n📁 File: {analysis['file']}") + print(f"📊 Groups: {analysis['groups']}") + print(f"🔔 Alerts Checked: {analysis['alerts_checked']}") + + # Count issues by severity + critical_count = 0 + warning_count = 0 + + for result in analysis['results']: + if result['issues']: + critical_count += 1 + + print(f"\n{'='*60}") + print(f"📈 Summary:") + print(f" ❌ Alerts with Issues: {critical_count}") + print(f" ⚠️ Warnings: {len(checker.warnings)}") + print(f" 💡 Recommendations: {len(checker.recommendations)}") + + # Print detailed results + if critical_count > 0: + print(f"\n{'='*60}") + print("❌ ALERTS WITH ISSUES:") + print(f"{'='*60}") + + for result in analysis['results']: + if result['issues']: + print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})") + print(f" Severity: {result['severity']}") + print(" Issues:") + for issue in result['issues']: + print(f" • {issue}") + + # Print warnings + if checker.warnings: + print(f"\n{'='*60}") + print("⚠️ WARNINGS:") + print(f"{'='*60}") + for warning in set(checker.warnings): # Remove duplicates + print(f"• {warning}") + + # Print recommendations + if checker.recommendations: + print(f"\n{'='*60}") + print("💡 RECOMMENDATIONS:") + print(f"{'='*60}") + for rec 
in list(set(checker.recommendations))[:10]: # Top 10 unique recommendations + print(f"• {rec}") + + # Overall score + total_alerts = analysis['alerts_checked'] + if total_alerts > 0: + quality_score = ((total_alerts - critical_count) / total_alerts) * 100 + print(f"\n{'='*60}") + print(f"📊 Quality Score: {quality_score:.1f}% ({total_alerts - critical_count}/{total_alerts} alerts passing)") + print(f"{'='*60}\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Audit Prometheus alert rules for quality and best practices", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check a single file + python3 alert_quality_checker.py alerts.yml + + # Check all YAML files in a directory + python3 alert_quality_checker.py /path/to/prometheus/rules/ + +Best Practices Checked: + ✓ Alert naming conventions (PascalCase, descriptive) + ✓ Required labels (severity) + ✓ Required annotations (summary, description) + ✓ Runbook URL presence + ✓ PromQL expression quality + ✓ 'for' clause to prevent flapping + ✓ Template variable usage + """ + ) + + parser.add_argument('path', help='Path to alert rules file or directory') + parser.add_argument('--verbose', action='store_true', help='Show all recommendations') + + args = parser.parse_args() + + checker = AlertQualityChecker() + + # Check if path is file or directory + path = Path(args.path) + + if path.is_file(): + files = [str(path)] + elif path.is_dir(): + files = [str(f) for f in path.rglob('*.yml')] + [str(f) for f in path.rglob('*.yaml')] + else: + print(f"❌ Path not found: {args.path}") + sys.exit(1) + + if not files: + print(f"❌ No YAML files found in: {args.path}") + sys.exit(1) + + print(f"🔍 Checking {len(files)} file(s)...") + + for filepath in files: + analysis = checker.analyze_file(filepath) + print_results(analysis, checker) + + +if __name__ == "__main__": + main() diff --git a/scripts/analyze_metrics.py b/scripts/analyze_metrics.py new file mode 100644 index 0000000..58343bd --- /dev/null +++ b/scripts/analyze_metrics.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 +""" +Analyze metrics from Prometheus or CloudWatch and detect anomalies. +Supports: rate of change analysis, spike detection, trend analysis. +""" + +import argparse +import sys +import json +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +import statistics + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. Install with: pip install requests") + sys.exit(1) + +try: + import boto3 +except ImportError: + boto3 = None + + +class MetricAnalyzer: + def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"): + self.source = source + self.endpoint = endpoint + self.region = region + if source == "cloudwatch" and boto3: + self.cloudwatch = boto3.client('cloudwatch', region_name=region) + elif source == "cloudwatch" and not boto3: + print("⚠️ boto3 not installed. 
Install with: pip install boto3") + sys.exit(1) + + def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]: + """Query Prometheus for metric data.""" + if not self.endpoint: + print("❌ Prometheus endpoint required") + sys.exit(1) + + try: + # Query range for last N hours + end_time = datetime.now() + start_time = end_time - timedelta(hours=hours) + + params = { + 'query': query, + 'start': start_time.timestamp(), + 'end': end_time.timestamp(), + 'step': '5m' # 5-minute resolution + } + + response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30) + response.raise_for_status() + + data = response.json() + if data['status'] != 'success': + print(f"❌ Prometheus query failed: {data}") + return [] + + return data['data']['result'] + + except Exception as e: + print(f"❌ Error querying Prometheus: {e}") + return [] + + def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str], + hours: int = 24, stat: str = "Average") -> List[Dict]: + """Query CloudWatch for metric data.""" + try: + end_time = datetime.now() + start_time = end_time - timedelta(hours=hours) + + dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()] + + response = self.cloudwatch.get_metric_statistics( + Namespace=namespace, + MetricName=metric_name, + Dimensions=dimensions_list, + StartTime=start_time, + EndTime=end_time, + Period=300, # 5-minute intervals + Statistics=[stat] + ) + + return sorted(response['Datapoints'], key=lambda x: x['Timestamp']) + + except Exception as e: + print(f"❌ Error querying CloudWatch: {e}") + return [] + + def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]: + """Detect anomalies using standard deviation method.""" + if len(values) < 10: + return { + "anomalies_detected": False, + "message": "Insufficient data points for anomaly detection" + } + + mean = statistics.mean(values) + stdev = statistics.stdev(values) + threshold_upper = mean + (sensitivity * stdev) + threshold_lower = mean - (sensitivity * stdev) + + anomalies = [] + for i, value in enumerate(values): + if value > threshold_upper or value < threshold_lower: + anomalies.append({ + "index": i, + "value": value, + "deviation": abs(value - mean) / stdev if stdev > 0 else 0 + }) + + return { + "anomalies_detected": len(anomalies) > 0, + "count": len(anomalies), + "anomalies": anomalies, + "stats": { + "mean": mean, + "stdev": stdev, + "threshold_upper": threshold_upper, + "threshold_lower": threshold_lower, + "total_points": len(values) + } + } + + def analyze_trend(self, values: List[float]) -> Dict[str, Any]: + """Analyze trend using simple linear regression.""" + if len(values) < 2: + return {"trend": "unknown", "message": "Insufficient data"} + + n = len(values) + x = list(range(n)) + x_mean = sum(x) / n + y_mean = sum(values) / n + + numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n)) + denominator = sum((x[i] - x_mean) ** 2 for i in range(n)) + + if denominator == 0: + return {"trend": "flat", "slope": 0} + + slope = numerator / denominator + + # Determine trend direction + if abs(slope) < 0.01 * y_mean: # Less than 1% change per interval + trend = "stable" + elif slope > 0: + trend = "increasing" + else: + trend = "decreasing" + + return { + "trend": trend, + "slope": slope, + "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0 + } + + +def print_results(results: Dict[str, Any]): + """Pretty print analysis results.""" + print("\n" + "="*60) + print("📊 METRIC ANALYSIS 
RESULTS") + print("="*60) + + if "error" in results: + print(f"\n❌ Error: {results['error']}") + return + + print(f"\n📈 Data Points: {results.get('data_points', 0)}") + + # Trend analysis + if "trend" in results: + trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "❓") + print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}") + if "rate_of_change" in results["trend"]: + print(f" Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval") + + # Anomaly detection + if "anomalies" in results: + anomaly_data = results["anomalies"] + if anomaly_data["anomalies_detected"]: + print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}") + print(f" Mean: {anomaly_data['stats']['mean']:.2f}") + print(f" Std Dev: {anomaly_data['stats']['stdev']:.2f}") + print(f" Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]") + + print("\n Top Anomalies:") + for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]: + print(f" • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)") + else: + print("\n✅ No anomalies detected") + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze metrics from Prometheus or CloudWatch", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Prometheus: Analyze request rate + python3 analyze_metrics.py prometheus \\ + --endpoint http://localhost:9090 \\ + --query 'rate(http_requests_total[5m])' \\ + --hours 24 + + # CloudWatch: Analyze CPU utilization + python3 analyze_metrics.py cloudwatch \\ + --namespace AWS/EC2 \\ + --metric CPUUtilization \\ + --dimensions InstanceId=i-1234567890abcdef0 \\ + --hours 48 + """ + ) + + parser.add_argument('source', choices=['prometheus', 'cloudwatch'], + help='Metric source') + parser.add_argument('--endpoint', help='Prometheus endpoint URL') + parser.add_argument('--query', help='PromQL query') + parser.add_argument('--namespace', help='CloudWatch namespace') + parser.add_argument('--metric', help='CloudWatch metric name') + parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)') + parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)') + parser.add_argument('--sensitivity', type=float, default=2.0, + help='Anomaly detection sensitivity (std deviations, default: 2.0)') + parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)') + + args = parser.parse_args() + + analyzer = MetricAnalyzer(args.source, args.endpoint, args.region) + + # Query metrics + if args.source == 'prometheus': + if not args.query: + print("❌ --query required for Prometheus") + sys.exit(1) + + print(f"🔍 Querying Prometheus: {args.query}") + results = analyzer.query_prometheus(args.query, args.hours) + + if not results: + print("❌ No data returned") + sys.exit(1) + + # Extract values from first result series + values = [float(v[1]) for v in results[0].get('values', [])] + + elif args.source == 'cloudwatch': + if not all([args.namespace, args.metric, args.dimensions]): + print("❌ --namespace, --metric, and --dimensions required for CloudWatch") + sys.exit(1) + + dims = dict(item.split('=') for item in args.dimensions.split(',')) + + print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}") + results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours) + + if not results: 
+ print("❌ No data returned") + sys.exit(1) + + values = [point['Average'] for point in results] + + # Analyze metrics + analysis_results = { + "data_points": len(values), + "trend": analyzer.analyze_trend(values), + "anomalies": analyzer.detect_anomalies(values, args.sensitivity) + } + + print_results(analysis_results) + + +if __name__ == "__main__": + main() diff --git a/scripts/dashboard_generator.py b/scripts/dashboard_generator.py new file mode 100644 index 0000000..a90d27e --- /dev/null +++ b/scripts/dashboard_generator.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +""" +Generate Grafana dashboards from templates. +Supports: web applications, Kubernetes, databases, Redis, and custom metrics. +""" + +import argparse +import sys +import json +from typing import Dict, List, Any, Optional +from pathlib import Path + + +class DashboardGenerator: + def __init__(self, title: str, datasource: str = "Prometheus"): + self.title = title + self.datasource = datasource + self.dashboard = self._create_base_dashboard() + self.panel_id = 1 + self.row_y = 0 + + def _create_base_dashboard(self) -> Dict[str, Any]: + """Create base dashboard structure.""" + return { + "dashboard": { + "title": self.title, + "tags": [], + "timezone": "browser", + "schemaVersion": 16, + "version": 0, + "refresh": "30s", + "panels": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + } + }, + "overwrite": True + } + + def add_variable(self, name: str, label: str, query: str): + """Add a template variable.""" + variable = { + "name": name, + "label": label, + "type": "query", + "datasource": self.datasource, + "query": query, + "refresh": 1, + "regex": "", + "multi": False, + "includeAll": False + } + self.dashboard["dashboard"]["templating"]["list"].append(variable) + + def add_row(self, title: str): + """Add a row panel.""" + panel = { + "id": self.panel_id, + "type": "row", + "title": title, + "collapsed": False, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": self.row_y} + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + self.row_y += 1 + + def add_graph(self, title: str, targets: List[Dict[str, str]], unit: str = "short", + width: int = 12, height: int = 8): + """Add a graph panel.""" + panel = { + "id": self.panel_id, + "type": "graph", + "title": title, + "datasource": self.datasource, + "targets": [ + { + "expr": target["query"], + "legendFormat": target.get("legend", ""), + "refId": chr(65 + i) # A, B, C, etc. 
+ } + for i, target in enumerate(targets) + ], + "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y}, + "yaxes": [ + {"format": unit, "label": None, "show": True}, + {"format": "short", "label": None, "show": True} + ], + "lines": True, + "fill": 1, + "linewidth": 2, + "legend": { + "show": True, + "alignAsTable": True, + "avg": True, + "current": True, + "max": True, + "min": False, + "total": False, + "values": True + } + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + self.row_y += height + + def add_stat(self, title: str, query: str, unit: str = "short", + width: int = 6, height: int = 4): + """Add a stat panel (single value).""" + panel = { + "id": self.panel_id, + "type": "stat", + "title": title, + "datasource": self.datasource, + "targets": [ + { + "expr": query, + "refId": "A" + } + ], + "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y}, + "options": { + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "values": False, + "calcs": ["lastNotNull"] + } + }, + "fieldConfig": { + "defaults": { + "unit": unit, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": None, "color": "green"}, + {"value": 80, "color": "red"} + ] + } + } + } + } + self.dashboard["dashboard"]["panels"].append(panel) + self.panel_id += 1 + + def generate_webapp_dashboard(self, service: str): + """Generate dashboard for web application.""" + self.add_variable("service", "Service", f"label_values({service}_http_requests_total, service)") + + # Request metrics + self.add_row("Request Metrics") + + self.add_graph( + "Request Rate", + [{"query": f'sum(rate({service}_http_requests_total[5m])) by (status)', "legend": "{{status}}"}], + unit="reqps", + width=12 + ) + + self.add_graph( + "Request Latency (p50, p95, p99)", + [ + {"query": f'histogram_quantile(0.50, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p50"}, + {"query": f'histogram_quantile(0.95, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p95"}, + {"query": f'histogram_quantile(0.99, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p99"} + ], + unit="s", + width=12 + ) + + # Error rate + self.add_row("Errors") + + self.add_graph( + "Error Rate (%)", + [{"query": f'sum(rate({service}_http_requests_total{{status=~"5.."}}[5m])) / sum(rate({service}_http_requests_total[5m])) * 100', "legend": "Error Rate"}], + unit="percent", + width=12 + ) + + # Resource usage + self.add_row("Resource Usage") + + self.add_graph( + "CPU Usage", + [{"query": f'sum(rate(process_cpu_seconds_total{{job="{service}"}}[5m])) * 100', "legend": "CPU %"}], + unit="percent", + width=12 + ) + + self.add_graph( + "Memory Usage", + [{"query": f'process_resident_memory_bytes{{job="{service}"}}', "legend": "Memory"}], + unit="bytes", + width=12 + ) + + def generate_kubernetes_dashboard(self, namespace: str): + """Generate dashboard for Kubernetes cluster.""" + self.add_variable("namespace", "Namespace", f"label_values(kube_pod_info, namespace)") + + # Cluster overview + self.add_row("Cluster Overview") + + self.add_stat("Total Pods", f'count(kube_pod_info{{namespace="{namespace}"}})', width=6) + self.add_stat("Running Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Running"}})', width=6) + self.add_stat("Pending Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Pending"}})', width=6) + self.add_stat("Failed Pods", 
f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Failed"}})', width=6) + + # Resource usage + self.add_row("Resource Usage") + + self.add_graph( + "CPU Usage by Pod", + [{"query": f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "{{pod}}"}], + unit="percent", + width=12 + ) + + self.add_graph( + "Memory Usage by Pod", + [{"query": f'sum(container_memory_usage_bytes{{namespace="{namespace}"}}) by (pod)', "legend": "{{pod}}"}], + unit="bytes", + width=12 + ) + + # Network + self.add_row("Network") + + self.add_graph( + "Network I/O", + [ + {"query": f'sum(rate(container_network_receive_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Receive - {{pod}}"}, + {"query": f'sum(rate(container_network_transmit_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Transmit - {{pod}}"} + ], + unit="Bps", + width=12 + ) + + def generate_database_dashboard(self, db_type: str, instance: str): + """Generate dashboard for database (postgres/mysql).""" + if db_type == "postgres": + self._generate_postgres_dashboard(instance) + elif db_type == "mysql": + self._generate_mysql_dashboard(instance) + + def _generate_postgres_dashboard(self, instance: str): + """Generate PostgreSQL dashboard.""" + self.add_row("PostgreSQL Metrics") + + self.add_graph( + "Connections", + [ + {"query": f'pg_stat_database_numbackends{{instance="{instance}"}}', "legend": "{{datname}}"} + ], + unit="short", + width=12 + ) + + self.add_graph( + "Transactions per Second", + [ + {"query": f'rate(pg_stat_database_xact_commit{{instance="{instance}"}}[5m])', "legend": "Commits"}, + {"query": f'rate(pg_stat_database_xact_rollback{{instance="{instance}"}}[5m])', "legend": "Rollbacks"} + ], + unit="tps", + width=12 + ) + + self.add_graph( + "Query Duration (p95)", + [ + {"query": f'histogram_quantile(0.95, rate(pg_stat_statements_total_time_bucket{{instance="{instance}"}}[5m]))', "legend": "p95"} + ], + unit="ms", + width=12 + ) + + def _generate_mysql_dashboard(self, instance: str): + """Generate MySQL dashboard.""" + self.add_row("MySQL Metrics") + + self.add_graph( + "Connections", + [ + {"query": f'mysql_global_status_threads_connected{{instance="{instance}"}}', "legend": "Connected"}, + {"query": f'mysql_global_status_threads_running{{instance="{instance}"}}', "legend": "Running"} + ], + unit="short", + width=12 + ) + + self.add_graph( + "Queries per Second", + [ + {"query": f'rate(mysql_global_status_queries{{instance="{instance}"}}[5m])', "legend": "Queries"} + ], + unit="qps", + width=12 + ) + + def save(self, output_file: str): + """Save dashboard to file.""" + try: + with open(output_file, 'w') as f: + json.dump(self.dashboard, f, indent=2) + return True + except Exception as e: + print(f"❌ Error saving dashboard: {e}") + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Generate Grafana dashboards from templates", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Web application dashboard + python3 dashboard_generator.py webapp \\ + --title "My API Dashboard" \\ + --service my_api \\ + --output dashboard.json + + # Kubernetes dashboard + python3 dashboard_generator.py kubernetes \\ + --title "K8s Namespace" \\ + --namespace production \\ + --output k8s-dashboard.json + + # Database dashboard + python3 dashboard_generator.py database \\ + --title "PostgreSQL" \\ + --db-type postgres \\ + --instance db.example.com:5432 \\ + --output db-dashboard.json + """ + ) + + 
parser.add_argument('type', choices=['webapp', 'kubernetes', 'database'], + help='Dashboard type') + parser.add_argument('--title', required=True, help='Dashboard title') + parser.add_argument('--output', required=True, help='Output file path') + parser.add_argument('--datasource', default='Prometheus', help='Data source name') + + # Web app specific + parser.add_argument('--service', help='Service name (for webapp)') + + # Kubernetes specific + parser.add_argument('--namespace', help='Kubernetes namespace') + + # Database specific + parser.add_argument('--db-type', choices=['postgres', 'mysql'], help='Database type') + parser.add_argument('--instance', help='Database instance') + + args = parser.parse_args() + + print(f"🎨 Generating {args.type} dashboard: {args.title}") + + generator = DashboardGenerator(args.title, args.datasource) + + if args.type == 'webapp': + if not args.service: + print("❌ --service required for webapp dashboard") + sys.exit(1) + generator.generate_webapp_dashboard(args.service) + + elif args.type == 'kubernetes': + if not args.namespace: + print("❌ --namespace required for kubernetes dashboard") + sys.exit(1) + generator.generate_kubernetes_dashboard(args.namespace) + + elif args.type == 'database': + if not args.db_type or not args.instance: + print("❌ --db-type and --instance required for database dashboard") + sys.exit(1) + generator.generate_database_dashboard(args.db_type, args.instance) + + if generator.save(args.output): + print(f"✅ Dashboard saved to: {args.output}") + print(f"\n📝 Import to Grafana:") + print(f" 1. Go to Grafana → Dashboards → Import") + print(f" 2. Upload {args.output}") + print(f" 3. Select datasource and save") + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/datadog_cost_analyzer.py b/scripts/datadog_cost_analyzer.py new file mode 100644 index 0000000..a748266 --- /dev/null +++ b/scripts/datadog_cost_analyzer.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python3 +""" +Analyze Datadog usage and identify cost optimization opportunities. +Helps find waste in custom metrics, logs, APM, and infrastructure monitoring. +""" + +import argparse +import sys +import os +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from collections import defaultdict + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. 
Install with: pip install requests") + sys.exit(1) + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class DatadogCostAnalyzer: + # Pricing (as of 2024-2025) + PRICING = { + 'infrastructure_pro': 15, # per host per month + 'infrastructure_enterprise': 23, + 'custom_metric': 0.01, # per metric per month (first 100 free per host) + 'log_ingestion': 0.10, # per GB ingested per month + 'apm_host': 31, # APM Pro per host per month + 'apm_span': 1.70, # per million indexed spans + } + + def __init__(self, api_key: str, app_key: str, site: str = "datadoghq.com"): + self.api_key = api_key + self.app_key = app_key + self.site = site + self.base_url = f"https://api.{site}" + self.headers = { + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key, + 'Content-Type': 'application/json' + } + + def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict: + """Make API request to Datadog.""" + try: + url = f"{self.base_url}{endpoint}" + response = requests.get(url, headers=self.headers, params=params, timeout=30) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"❌ API Error: {e}") + return {} + + def get_usage_metrics(self, start_date: str, end_date: str) -> Dict[str, Any]: + """Get usage metrics for specified date range.""" + endpoint = "/api/v1/usage/summary" + params = { + 'start_month': start_date, + 'end_month': end_date, + 'include_org_details': 'true' + } + + data = self._make_request(endpoint, params) + return data.get('usage', []) + + def get_custom_metrics(self) -> Dict[str, Any]: + """Get custom metrics usage and identify high-cardinality metrics.""" + endpoint = "/api/v1/usage/timeseries" + + # Get last 30 days + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'metrics': [], 'total_count': 0} + + # Extract custom metrics info + usage_data = data.get('usage', []) + + metrics_summary = { + 'total_custom_metrics': 0, + 'avg_custom_metrics': 0, + 'billable_metrics': 0 + } + + for day in usage_data: + if 'timeseries' in day: + for ts in day['timeseries']: + if ts.get('metric_category') == 'custom': + metrics_summary['total_custom_metrics'] = max( + metrics_summary['total_custom_metrics'], + ts.get('num_custom_timeseries', 0) + ) + + # Calculate billable (first 100 free) + metrics_summary['billable_metrics'] = max(0, metrics_summary['total_custom_metrics'] - 100) + + return metrics_summary + + def get_infrastructure_hosts(self) -> Dict[str, Any]: + """Get infrastructure host count and breakdown.""" + endpoint = "/api/v1/usage/hosts" + + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'total_hosts': 0} + + usage = data.get('usage', []) + + host_summary = { + 'total_hosts': 0, + 'agent_hosts': 0, + 'aws_hosts': 0, + 'azure_hosts': 0, + 'gcp_hosts': 0, + 'container_count': 0 + } + + for day in usage: + host_summary['total_hosts'] = max(host_summary['total_hosts'], day.get('host_count', 0)) + host_summary['agent_hosts'] = max(host_summary['agent_hosts'], day.get('agent_host_count', 0)) + host_summary['aws_hosts'] = max(host_summary['aws_hosts'], day.get('aws_host_count', 0)) + 
host_summary['azure_hosts'] = max(host_summary['azure_hosts'], day.get('azure_host_count', 0)) + host_summary['gcp_hosts'] = max(host_summary['gcp_hosts'], day.get('gcp_host_count', 0)) + host_summary['container_count'] = max(host_summary['container_count'], day.get('container_count', 0)) + + return host_summary + + def get_log_usage(self) -> Dict[str, Any]: + """Get log ingestion and retention usage.""" + endpoint = "/api/v1/usage/logs" + + end_date = datetime.now() + start_date = end_date - timedelta(days=30) + + params = { + 'start_hr': int(start_date.timestamp()), + 'end_hr': int(end_date.timestamp()) + } + + data = self._make_request(endpoint, params) + + if not data: + return {'total_gb': 0, 'daily_avg_gb': 0} + + usage = data.get('usage', []) + + total_ingested = 0 + days_count = len(usage) + + for day in usage: + total_ingested += day.get('ingested_events_bytes', 0) + + total_gb = total_ingested / (1024**3) # Convert to GB + daily_avg_gb = total_gb / max(days_count, 1) + + return { + 'total_gb': total_gb, + 'daily_avg_gb': daily_avg_gb, + 'monthly_projected_gb': daily_avg_gb * 30 + } + + def get_unused_monitors(self) -> List[Dict[str, Any]]: + """Find monitors that haven't alerted in 30+ days.""" + endpoint = "/api/v1/monitor" + + data = self._make_request(endpoint) + + if not data: + return [] + + monitors = data if isinstance(data, list) else [] + + unused = [] + now = datetime.now() + + for monitor in monitors: + # Check if monitor has triggered recently + overall_state = monitor.get('overall_state') + modified = monitor.get('modified', '') + + # If monitor has been in OK state and not modified in 30+ days + try: + if modified: + mod_date = datetime.fromisoformat(modified.replace('Z', '+00:00')) + days_since_modified = (now - mod_date.replace(tzinfo=None)).days + + if days_since_modified > 30 and overall_state in ['OK', 'No Data']: + unused.append({ + 'name': monitor.get('name', 'Unknown'), + 'id': monitor.get('id'), + 'days_since_modified': days_since_modified, + 'state': overall_state + }) + except: + pass + + return unused + + def calculate_costs(self, usage_data: Dict[str, Any]) -> Dict[str, float]: + """Calculate estimated monthly costs.""" + costs = { + 'infrastructure': 0, + 'custom_metrics': 0, + 'logs': 0, + 'apm': 0, + 'total': 0 + } + + # Infrastructure (assuming Pro tier) + if 'hosts' in usage_data: + costs['infrastructure'] = usage_data['hosts'].get('total_hosts', 0) * self.PRICING['infrastructure_pro'] + + # Custom metrics + if 'custom_metrics' in usage_data: + billable = usage_data['custom_metrics'].get('billable_metrics', 0) + costs['custom_metrics'] = billable * self.PRICING['custom_metric'] + + # Logs + if 'logs' in usage_data: + monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0) + costs['logs'] = monthly_gb * self.PRICING['log_ingestion'] + + costs['total'] = sum(costs.values()) + + return costs + + def get_recommendations(self, usage_data: Dict[str, Any]) -> List[str]: + """Generate cost optimization recommendations.""" + recommendations = [] + + # Custom metrics recommendations + if 'custom_metrics' in usage_data: + billable = usage_data['custom_metrics'].get('billable_metrics', 0) + if billable > 500: + savings = (billable * 0.3) * self.PRICING['custom_metric'] # Assume 30% reduction possible + recommendations.append({ + 'category': 'Custom Metrics', + 'issue': f'High custom metric count: {billable:,} billable metrics', + 'action': 'Review metric tags for high cardinality, consider aggregating or dropping unused metrics', + 'potential_savings': 
f'${savings:.2f}/month' + }) + + # Container vs VM recommendations + if 'hosts' in usage_data: + hosts = usage_data['hosts'].get('total_hosts', 0) + containers = usage_data['hosts'].get('container_count', 0) + + if containers > hosts * 10: # Many containers per host + savings = hosts * 0.2 * self.PRICING['infrastructure_pro'] + recommendations.append({ + 'category': 'Infrastructure', + 'issue': f'{containers:,} containers running on {hosts} hosts', + 'action': 'Consider using container monitoring instead of host-based (can be 50-70% cheaper)', + 'potential_savings': f'${savings:.2f}/month' + }) + + # Unused monitors + if 'unused_monitors' in usage_data: + count = len(usage_data['unused_monitors']) + if count > 10: + recommendations.append({ + 'category': 'Monitors', + 'issue': f'{count} monitors unused for 30+ days', + 'action': 'Delete or disable unused monitors to reduce noise and improve performance', + 'potential_savings': 'Operational efficiency' + }) + + # Log volume recommendations + if 'logs' in usage_data: + monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0) + if monthly_gb > 100: + savings = (monthly_gb * 0.4) * self.PRICING['log_ingestion'] # 40% reduction + recommendations.append({ + 'category': 'Logs', + 'issue': f'High log volume: {monthly_gb:.1f} GB/month projected', + 'action': 'Review log sources, implement sampling for debug logs, exclude health checks', + 'potential_savings': f'${savings:.2f}/month' + }) + + # Migration recommendation if costs are high + costs = self.calculate_costs(usage_data) + if costs['total'] > 5000: + oss_cost = usage_data['hosts'].get('total_hosts', 0) * 15 # Rough estimate for self-hosted + savings = costs['total'] - oss_cost + recommendations.append({ + 'category': 'Strategic', + 'issue': f'Total monthly cost: ${costs["total"]:.2f}', + 'action': 'Consider migrating to open-source stack (Prometheus + Grafana + Loki)', + 'potential_savings': f'${savings:.2f}/month (~{(savings/costs["total"]*100):.0f}% reduction)' + }) + + return recommendations + + +def print_usage_summary(usage_data: Dict[str, Any]): + """Print usage summary.""" + print("\n" + "="*70) + print("📊 DATADOG USAGE SUMMARY") + print("="*70) + + # Infrastructure + if 'hosts' in usage_data: + hosts = usage_data['hosts'] + print(f"\n🖥️ Infrastructure:") + print(f" Total Hosts: {hosts.get('total_hosts', 0):,}") + print(f" Agent Hosts: {hosts.get('agent_hosts', 0):,}") + print(f" AWS Hosts: {hosts.get('aws_hosts', 0):,}") + print(f" Azure Hosts: {hosts.get('azure_hosts', 0):,}") + print(f" GCP Hosts: {hosts.get('gcp_hosts', 0):,}") + print(f" Containers: {hosts.get('container_count', 0):,}") + + # Custom Metrics + if 'custom_metrics' in usage_data: + metrics = usage_data['custom_metrics'] + print(f"\n📈 Custom Metrics:") + print(f" Total: {metrics.get('total_custom_metrics', 0):,}") + print(f" Billable: {metrics.get('billable_metrics', 0):,} (first 100 free)") + + # Logs + if 'logs' in usage_data: + logs = usage_data['logs'] + print(f"\n📝 Logs:") + print(f" Daily Average: {logs.get('daily_avg_gb', 0):.2f} GB") + print(f" Monthly Projected: {logs.get('monthly_projected_gb', 0):.2f} GB") + + # Unused Monitors + if 'unused_monitors' in usage_data: + print(f"\n🔔 Unused Monitors:") + print(f" Count: {len(usage_data['unused_monitors'])}") + + +def print_cost_breakdown(costs: Dict[str, float]): + """Print cost breakdown.""" + print("\n" + "="*70) + print("💰 ESTIMATED MONTHLY COSTS") + print("="*70) + + print(f"\n Infrastructure Monitoring: ${costs['infrastructure']:,.2f}") + print(f" 
Custom Metrics: ${costs['custom_metrics']:,.2f}") + print(f" Log Management: ${costs['logs']:,.2f}") + print(f" APM: ${costs['apm']:,.2f}") + print(f" " + "-"*40) + print(f" TOTAL: ${costs['total']:,.2f}/month") + print(f" ${costs['total']*12:,.2f}/year") + + +def print_recommendations(recommendations: List[Dict]): + """Print recommendations.""" + print("\n" + "="*70) + print("💡 COST OPTIMIZATION RECOMMENDATIONS") + print("="*70) + + total_savings = 0 + + for i, rec in enumerate(recommendations, 1): + print(f"\n{i}. {rec['category']}") + print(f" Issue: {rec['issue']}") + print(f" Action: {rec['action']}") + print(f" Potential Savings: {rec['potential_savings']}") + + # Extract savings amount if it's a dollar value + if '$' in rec['potential_savings']: + try: + amount = float(rec['potential_savings'].replace('$', '').replace('/month', '').replace(',', '')) + total_savings += amount + except: + pass + + if total_savings > 0: + print(f"\n{'='*70}") + print(f"💵 Total Potential Monthly Savings: ${total_savings:,.2f}") + print(f"💵 Total Potential Annual Savings: ${total_savings*12:,.2f}") + print(f"{'='*70}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze Datadog usage and identify cost optimization opportunities", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Analyze current usage + python3 datadog_cost_analyzer.py \\ + --api-key DD_API_KEY \\ + --app-key DD_APP_KEY + + # Use environment variables + export DD_API_KEY=your_api_key + export DD_APP_KEY=your_app_key + python3 datadog_cost_analyzer.py + + # Specify site (for EU) + python3 datadog_cost_analyzer.py --site datadoghq.eu + +Required Datadog Permissions: + - usage_read + - monitors_read + """ + ) + + parser.add_argument('--api-key', + default=os.environ.get('DD_API_KEY'), + help='Datadog API key (or set DD_API_KEY env var)') + parser.add_argument('--app-key', + default=os.environ.get('DD_APP_KEY'), + help='Datadog Application key (or set DD_APP_KEY env var)') + parser.add_argument('--site', + default='datadoghq.com', + help='Datadog site (default: datadoghq.com, EU: datadoghq.eu)') + + args = parser.parse_args() + + if not args.api_key or not args.app_key: + print("❌ Error: API key and Application key required") + print(" Set via --api-key and --app-key flags or DD_API_KEY and DD_APP_KEY env vars") + sys.exit(1) + + print("🔍 Analyzing Datadog usage...") + print(" This may take 30-60 seconds...\n") + + analyzer = DatadogCostAnalyzer(args.api_key, args.app_key, args.site) + + # Gather usage data + usage_data = {} + + print(" ⏳ Fetching infrastructure usage...") + usage_data['hosts'] = analyzer.get_infrastructure_hosts() + + print(" ⏳ Fetching custom metrics...") + usage_data['custom_metrics'] = analyzer.get_custom_metrics() + + print(" ⏳ Fetching log usage...") + usage_data['logs'] = analyzer.get_log_usage() + + print(" ⏳ Finding unused monitors...") + usage_data['unused_monitors'] = analyzer.get_unused_monitors() + + # Calculate costs + costs = analyzer.calculate_costs(usage_data) + + # Generate recommendations + recommendations = analyzer.get_recommendations(usage_data) + + # Print results + print_usage_summary(usage_data) + print_cost_breakdown(costs) + print_recommendations(recommendations) + + print("\n" + "="*70) + print("✅ Analysis complete!") + print("="*70) + + +if __name__ == "__main__": + main() diff --git a/scripts/health_check_validator.py b/scripts/health_check_validator.py new file mode 100644 index 0000000..1be4fc9 --- /dev/null +++ 
b/scripts/health_check_validator.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +""" +Validate health check endpoints and analyze response quality. +Checks: response time, status code, response format, dependencies. +""" + +import argparse +import sys +import time +import json +from typing import Dict, List, Any, Optional +from urllib.parse import urlparse + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. Install with: pip install requests") + sys.exit(1) + + +class HealthCheckValidator: + def __init__(self, timeout: int = 5): + self.timeout = timeout + self.results = [] + + def validate_endpoint(self, url: str) -> Dict[str, Any]: + """Validate a health check endpoint.""" + result = { + "url": url, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + "checks": [], + "warnings": [], + "errors": [] + } + + try: + # Make request + start_time = time.time() + response = requests.get(url, timeout=self.timeout, verify=True) + response_time = time.time() - start_time + + result["status_code"] = response.status_code + result["response_time"] = response_time + + # Check 1: Status code + if response.status_code == 200: + result["checks"].append("✅ Status code is 200") + else: + result["errors"].append(f"❌ Unexpected status code: {response.status_code} (expected 200)") + + # Check 2: Response time + if response_time < 1.0: + result["checks"].append(f"✅ Response time: {response_time:.3f}s (< 1s)") + elif response_time < 3.0: + result["warnings"].append(f"⚠️ Slow response time: {response_time:.3f}s (should be < 1s)") + else: + result["errors"].append(f"❌ Very slow response time: {response_time:.3f}s (should be < 1s)") + + # Check 3: Content type + content_type = response.headers.get('Content-Type', '') + if 'application/json' in content_type: + result["checks"].append("✅ Content-Type is application/json") + + # Try to parse JSON + try: + data = response.json() + result["response_data"] = data + + # Check for common health check fields + self._validate_json_structure(data, result) + + except json.JSONDecodeError: + result["errors"].append("❌ Invalid JSON response") + elif 'text/plain' in content_type: + result["warnings"].append("⚠️ Content-Type is text/plain (JSON recommended)") + result["response_data"] = response.text + else: + result["warnings"].append(f"⚠️ Unexpected Content-Type: {content_type}") + + # Check 4: Response headers + self._validate_headers(response.headers, result) + + except requests.exceptions.Timeout: + result["errors"].append(f"❌ Request timeout (> {self.timeout}s)") + result["status_code"] = None + result["response_time"] = None + + except requests.exceptions.ConnectionError: + result["errors"].append("❌ Connection error (endpoint unreachable)") + result["status_code"] = None + result["response_time"] = None + + except requests.exceptions.SSLError: + result["errors"].append("❌ SSL certificate validation failed") + result["status_code"] = None + result["response_time"] = None + + except Exception as e: + result["errors"].append(f"❌ Unexpected error: {str(e)}") + result["status_code"] = None + result["response_time"] = None + + # Overall status + if result["errors"]: + result["overall_status"] = "UNHEALTHY" + elif result["warnings"]: + result["overall_status"] = "DEGRADED" + else: + result["overall_status"] = "HEALTHY" + + return result + + def _validate_json_structure(self, data: Dict[str, Any], result: Dict[str, Any]): + """Validate JSON health check structure.""" + # Check for status field + if "status" in data: + status = 
data["status"] + if status in ["ok", "healthy", "up", "pass"]: + result["checks"].append(f"✅ Status field present: '{status}'") + else: + result["warnings"].append(f"⚠️ Status field has unexpected value: '{status}'") + else: + result["warnings"].append("⚠️ Missing 'status' field (recommended)") + + # Check for version/build info + if any(key in data for key in ["version", "build", "commit", "timestamp"]): + result["checks"].append("✅ Version/build information present") + else: + result["warnings"].append("⚠️ No version/build information (recommended)") + + # Check for dependencies + if "dependencies" in data or "checks" in data or "components" in data: + result["checks"].append("✅ Dependency checks present") + + # Validate dependency structure + deps = data.get("dependencies") or data.get("checks") or data.get("components") + if isinstance(deps, dict): + unhealthy_deps = [] + for name, info in deps.items(): + if isinstance(info, dict): + dep_status = info.get("status", "unknown") + if dep_status not in ["ok", "healthy", "up", "pass"]: + unhealthy_deps.append(name) + elif isinstance(info, str): + if info not in ["ok", "healthy", "up", "pass"]: + unhealthy_deps.append(name) + + if unhealthy_deps: + result["warnings"].append(f"⚠️ Unhealthy dependencies: {', '.join(unhealthy_deps)}") + else: + result["checks"].append(f"✅ All dependencies healthy ({len(deps)} checked)") + else: + result["warnings"].append("⚠️ No dependency checks (recommended for production services)") + + # Check for uptime/metrics + if any(key in data for key in ["uptime", "metrics", "stats"]): + result["checks"].append("✅ Metrics/stats present") + + def _validate_headers(self, headers: Dict[str, str], result: Dict[str, Any]): + """Validate response headers.""" + # Check for caching headers + cache_control = headers.get('Cache-Control', '') + if 'no-cache' in cache_control or 'no-store' in cache_control: + result["checks"].append("✅ Caching disabled (Cache-Control: no-cache)") + else: + result["warnings"].append("⚠️ Caching not explicitly disabled (add Cache-Control: no-cache)") + + def validate_multiple(self, urls: List[str]) -> List[Dict[str, Any]]: + """Validate multiple health check endpoints.""" + results = [] + for url in urls: + print(f"🔍 Checking: {url}") + result = self.validate_endpoint(url) + results.append(result) + return results + + +def print_result(result: Dict[str, Any], verbose: bool = False): + """Print validation result.""" + status_emoji = { + "HEALTHY": "✅", + "DEGRADED": "⚠️", + "UNHEALTHY": "❌" + } + + print("\n" + "="*60) + emoji = status_emoji.get(result["overall_status"], "❓") + print(f"{emoji} {result['overall_status']}: {result['url']}") + print("="*60) + + if result.get("status_code"): + print(f"\n📊 Status Code: {result['status_code']}") + print(f"⏱️ Response Time: {result['response_time']:.3f}s") + + # Print checks + if result["checks"]: + print(f"\n✅ Passed Checks:") + for check in result["checks"]: + print(f" {check}") + + # Print warnings + if result["warnings"]: + print(f"\n⚠️ Warnings:") + for warning in result["warnings"]: + print(f" {warning}") + + # Print errors + if result["errors"]: + print(f"\n❌ Errors:") + for error in result["errors"]: + print(f" {error}") + + # Print response data if verbose + if verbose and "response_data" in result: + print(f"\n📄 Response Data:") + if isinstance(result["response_data"], dict): + print(json.dumps(result["response_data"], indent=2)) + else: + print(result["response_data"]) + + print("="*60) + + +def print_summary(results: List[Dict[str, Any]]): + 
"""Print summary of multiple validations.""" + print("\n" + "="*60) + print("📊 HEALTH CHECK VALIDATION SUMMARY") + print("="*60) + + healthy = sum(1 for r in results if r["overall_status"] == "HEALTHY") + degraded = sum(1 for r in results if r["overall_status"] == "DEGRADED") + unhealthy = sum(1 for r in results if r["overall_status"] == "UNHEALTHY") + + print(f"\n✅ Healthy: {healthy}/{len(results)}") + print(f"⚠️ Degraded: {degraded}/{len(results)}") + print(f"❌ Unhealthy: {unhealthy}/{len(results)}") + + if results: + avg_response_time = sum(r.get("response_time", 0) for r in results if r.get("response_time")) / len(results) + print(f"\n⏱️ Average Response Time: {avg_response_time:.3f}s") + + print("="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Validate health check endpoints", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check a single endpoint + python3 health_check_validator.py https://api.example.com/health + + # Check multiple endpoints + python3 health_check_validator.py \\ + https://api.example.com/health \\ + https://api.example.com/readiness + + # Verbose output with response data + python3 health_check_validator.py https://api.example.com/health --verbose + + # Custom timeout + python3 health_check_validator.py https://api.example.com/health --timeout 10 + +Best Practices Checked: + ✓ Returns 200 status code + ✓ Response time < 1 second + ✓ Returns JSON format + ✓ Contains 'status' field + ✓ Includes version/build info + ✓ Checks dependencies + ✓ Includes metrics + ✓ Disables caching + """ + ) + + parser.add_argument('urls', nargs='+', help='Health check endpoint URL(s)') + parser.add_argument('--timeout', type=int, default=5, help='Request timeout in seconds (default: 5)') + parser.add_argument('--verbose', action='store_true', help='Show detailed response data') + + args = parser.parse_args() + + validator = HealthCheckValidator(timeout=args.timeout) + + results = validator.validate_multiple(args.urls) + + # Print individual results + for result in results: + print_result(result, args.verbose) + + # Print summary if multiple endpoints + if len(results) > 1: + print_summary(results) + + +if __name__ == "__main__": + main() diff --git a/scripts/log_analyzer.py b/scripts/log_analyzer.py new file mode 100644 index 0000000..a4f7803 --- /dev/null +++ b/scripts/log_analyzer.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +""" +Parse and analyze logs for patterns, errors, and anomalies. +Supports: error detection, frequency analysis, pattern matching. 
+""" + +import argparse +import sys +import re +import json +from collections import Counter, defaultdict +from datetime import datetime +from typing import Dict, List, Any, Optional +from pathlib import Path + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class LogAnalyzer: + # Common log level patterns + LOG_LEVELS = { + 'ERROR': r'\b(ERROR|Error|error)\b', + 'WARN': r'\b(WARN|Warning|warn|warning)\b', + 'INFO': r'\b(INFO|Info|info)\b', + 'DEBUG': r'\b(DEBUG|Debug|debug)\b', + 'FATAL': r'\b(FATAL|Fatal|fatal|CRITICAL|Critical)\b' + } + + # Common error patterns + ERROR_PATTERNS = { + 'exception': r'Exception|exception|EXCEPTION', + 'stack_trace': r'\s+at\s+.*\(.*:\d+\)', + 'http_error': r'\b[45]\d{2}\b', # 4xx and 5xx HTTP codes + 'timeout': r'timeout|timed out|TIMEOUT', + 'connection_refused': r'connection refused|ECONNREFUSED', + 'out_of_memory': r'OutOfMemoryError|OOM|out of memory', + 'null_pointer': r'NullPointerException|null pointer|NPE', + 'database_error': r'SQLException|database error|DB error' + } + + def __init__(self, log_file: str): + self.log_file = log_file + self.lines = [] + self.log_levels = Counter() + self.error_patterns = Counter() + self.timestamps = [] + + def parse_file(self) -> bool: + """Parse log file.""" + try: + with open(self.log_file, 'r', encoding='utf-8', errors='ignore') as f: + self.lines = f.readlines() + return True + except Exception as e: + print(f"❌ Error reading file: {e}") + return False + + def analyze_log_levels(self): + """Count log levels.""" + for line in self.lines: + for level, pattern in self.LOG_LEVELS.items(): + if re.search(pattern, line): + self.log_levels[level] += 1 + break # Count each line only once + + def analyze_error_patterns(self): + """Detect common error patterns.""" + for line in self.lines: + for pattern_name, pattern in self.ERROR_PATTERNS.items(): + if re.search(pattern, line, re.IGNORECASE): + self.error_patterns[pattern_name] += 1 + + def extract_timestamps(self, timestamp_pattern: Optional[str] = None): + """Extract timestamps from logs.""" + if not timestamp_pattern: + # Common timestamp patterns + patterns = [ + r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}', # ISO format + r'\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}', # Apache format + r'\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}', # Syslog format + ] + else: + patterns = [timestamp_pattern] + + for line in self.lines: + for pattern in patterns: + match = re.search(pattern, line) + if match: + self.timestamps.append(match.group()) + break + + def find_error_lines(self, context: int = 2) -> List[Dict[str, Any]]: + """Find error lines with context.""" + errors = [] + + for i, line in enumerate(self.lines): + # Check if line contains error keywords + is_error = any(re.search(pattern, line, re.IGNORECASE) + for pattern in [self.LOG_LEVELS['ERROR'], self.LOG_LEVELS['FATAL']]) + + if is_error: + # Get context lines + start = max(0, i - context) + end = min(len(self.lines), i + context + 1) + context_lines = self.lines[start:end] + + errors.append({ + 'line_number': i + 1, + 'line': line.strip(), + 'context': ''.join(context_lines) + }) + + return errors + + def analyze_frequency(self, time_window_minutes: int = 5) -> Dict[str, Any]: + """Analyze log frequency over time.""" + if not self.timestamps: + return {"error": "No timestamps found"} + + # This is a simplified version - in production you'd parse actual timestamps + total_lines = len(self.lines) + if self.timestamps: + time_span = len(self.timestamps) + avg_per_window = total_lines / max(1, 
time_span / time_window_minutes) + else: + avg_per_window = 0 + + return { + "total_lines": total_lines, + "timestamps_found": len(self.timestamps), + "avg_per_window": avg_per_window + } + + def extract_unique_messages(self, pattern: str) -> List[str]: + """Extract unique messages matching a pattern.""" + matches = [] + seen = set() + + for line in self.lines: + match = re.search(pattern, line, re.IGNORECASE) + if match: + msg = match.group() if match.lastindex is None else match.group(1) + if msg not in seen: + matches.append(msg) + seen.add(msg) + + return matches + + def find_stack_traces(self) -> List[Dict[str, Any]]: + """Extract complete stack traces.""" + stack_traces = [] + current_trace = [] + in_trace = False + + for i, line in enumerate(self.lines): + # Start of stack trace + if re.search(r'Exception|Error.*:', line): + if current_trace: + stack_traces.append({ + 'line_start': i - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + current_trace = [line.strip()] + in_trace = True + # Stack trace continuation + elif in_trace and re.search(r'^\s+at\s+', line): + current_trace.append(line.strip()) + # End of stack trace + elif in_trace: + if current_trace: + stack_traces.append({ + 'line_start': i - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + current_trace = [] + in_trace = False + + # Add last trace if exists + if current_trace: + stack_traces.append({ + 'line_start': len(self.lines) - len(current_trace) + 1, + 'trace': '\n'.join(current_trace) + }) + + return stack_traces + + +def print_analysis_results(analyzer: LogAnalyzer, show_errors: bool = False, + show_traces: bool = False): + """Print analysis results.""" + print("\n" + "="*60) + print("📝 LOG ANALYSIS RESULTS") + print("="*60) + + print(f"\n📁 File: {analyzer.log_file}") + print(f"📊 Total Lines: {len(analyzer.lines):,}") + + # Log levels + if analyzer.log_levels: + print(f"\n{'='*60}") + print("📊 LOG LEVEL DISTRIBUTION:") + print(f"{'='*60}") + + level_emoji = { + 'FATAL': '🔴', + 'ERROR': '❌', + 'WARN': '⚠️', + 'INFO': 'ℹ️', + 'DEBUG': '🐛' + } + + for level, count in analyzer.log_levels.most_common(): + emoji = level_emoji.get(level, '•') + percentage = (count / len(analyzer.lines)) * 100 + print(f"{emoji} {level:10s}: {count:6,} ({percentage:5.1f}%)") + + # Error patterns + if analyzer.error_patterns: + print(f"\n{'='*60}") + print("🔍 ERROR PATTERNS DETECTED:") + print(f"{'='*60}") + + for pattern, count in analyzer.error_patterns.most_common(10): + print(f"• {pattern:20s}: {count:,} occurrences") + + # Timestamps + if analyzer.timestamps: + print(f"\n{'='*60}") + print(f"⏰ Timestamps Found: {len(analyzer.timestamps):,}") + print(f" First: {analyzer.timestamps[0]}") + print(f" Last: {analyzer.timestamps[-1]}") + + # Error lines + if show_errors: + errors = analyzer.find_error_lines(context=1) + if errors: + print(f"\n{'='*60}") + print(f"❌ ERROR LINES (showing first 10 of {len(errors)}):") + print(f"{'='*60}") + + for error in errors[:10]: + print(f"\nLine {error['line_number']}:") + print(f" {error['line']}") + + # Stack traces + if show_traces: + traces = analyzer.find_stack_traces() + if traces: + print(f"\n{'='*60}") + print(f"📚 STACK TRACES FOUND: {len(traces)}") + print(f"{'='*60}") + + for i, trace in enumerate(traces[:5], 1): + print(f"\nTrace {i} (starting at line {trace['line_start']}):") + print(trace['trace']) + if i < len(traces): + print("\n" + "-"*60) + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze log files for 
errors, patterns, and anomalies", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic analysis + python3 log_analyzer.py application.log + + # Show error lines with context + python3 log_analyzer.py application.log --show-errors + + # Show stack traces + python3 log_analyzer.py application.log --show-traces + + # Full analysis + python3 log_analyzer.py application.log --show-errors --show-traces + +Features: + • Log level distribution (ERROR, WARN, INFO, DEBUG, FATAL) + • Common error pattern detection + • Timestamp extraction + • Error line identification with context + • Stack trace extraction + • Frequency analysis + """ + ) + + parser.add_argument('log_file', help='Path to log file') + parser.add_argument('--show-errors', action='store_true', help='Show error lines') + parser.add_argument('--show-traces', action='store_true', help='Show stack traces') + parser.add_argument('--timestamp-pattern', help='Custom regex for timestamp extraction') + + args = parser.parse_args() + + if not Path(args.log_file).exists(): + print(f"❌ File not found: {args.log_file}") + sys.exit(1) + + print(f"🔍 Analyzing log file: {args.log_file}") + + analyzer = LogAnalyzer(args.log_file) + + if not analyzer.parse_file(): + sys.exit(1) + + # Perform analysis + analyzer.analyze_log_levels() + analyzer.analyze_error_patterns() + analyzer.extract_timestamps(args.timestamp_pattern) + + # Print results + print_analysis_results(analyzer, args.show_errors, args.show_traces) + + +if __name__ == "__main__": + main() diff --git a/scripts/slo_calculator.py b/scripts/slo_calculator.py new file mode 100644 index 0000000..78c38bb --- /dev/null +++ b/scripts/slo_calculator.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +Calculate SLO compliance, error budgets, and burn rates. +Supports availability SLOs and latency SLOs. +""" + +import argparse +import sys +from datetime import datetime, timedelta +from typing import Dict, Any, Optional + +try: + from tabulate import tabulate +except ImportError: + print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate") + tabulate = None + + +class SLOCalculator: + # SLO targets and allowed downtime per period + SLO_TARGETS = { + "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1}, # days + "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05}, + "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01}, + "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005}, + "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001}, + "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005}, + "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001}, + } + + def __init__(self, slo_target: float, period_days: int = 30): + """ + Initialize SLO calculator. + + Args: + slo_target: SLO target percentage (e.g., 99.9) + period_days: Time period in days (default: 30) + """ + self.slo_target = slo_target + self.period_days = period_days + self.error_budget_minutes = self.calculate_error_budget_minutes() + + def calculate_error_budget_minutes(self) -> float: + """Calculate error budget in minutes for the period.""" + total_minutes = self.period_days * 24 * 60 + allowed_error_rate = (100 - self.slo_target) / 100 + return total_minutes * allowed_error_rate + + def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]: + """ + Calculate availability SLO compliance. 
+ + Args: + total_requests: Total number of requests + failed_requests: Number of failed requests + + Returns: + Dict with SLO compliance metrics + """ + if total_requests == 0: + return { + "error": "No requests in the period", + "slo_met": False + } + + success_rate = ((total_requests - failed_requests) / total_requests) * 100 + error_rate = (failed_requests / total_requests) * 100 + + # Calculate error budget consumption + allowed_failures = total_requests * ((100 - self.slo_target) / 100) + error_budget_consumed = (failed_requests / allowed_failures) * 100 if allowed_failures > 0 else float('inf') + error_budget_remaining = max(0, 100 - error_budget_consumed) + + # Determine if SLO is met + slo_met = success_rate >= self.slo_target + + return { + "slo_target": self.slo_target, + "period_days": self.period_days, + "total_requests": total_requests, + "failed_requests": failed_requests, + "success_requests": total_requests - failed_requests, + "success_rate": success_rate, + "error_rate": error_rate, + "slo_met": slo_met, + "error_budget_total": allowed_failures, + "error_budget_consumed": error_budget_consumed, + "error_budget_remaining": error_budget_remaining, + "margin": success_rate - self.slo_target + } + + def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]: + """ + Calculate latency SLO compliance. + + Args: + total_requests: Total number of requests + requests_exceeding_threshold: Number of requests exceeding latency threshold + + Returns: + Dict with SLO compliance metrics + """ + if total_requests == 0: + return { + "error": "No requests in the period", + "slo_met": False + } + + within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100 + + # Calculate error budget consumption + allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100) + error_budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100 if allowed_slow_requests > 0 else float('inf') + error_budget_remaining = max(0, 100 - error_budget_consumed) + + slo_met = within_threshold_rate >= self.slo_target + + return { + "slo_target": self.slo_target, + "period_days": self.period_days, + "total_requests": total_requests, + "requests_exceeding_threshold": requests_exceeding_threshold, + "requests_within_threshold": total_requests - requests_exceeding_threshold, + "within_threshold_rate": within_threshold_rate, + "slo_met": slo_met, + "error_budget_total": allowed_slow_requests, + "error_budget_consumed": error_budget_consumed, + "error_budget_remaining": error_budget_remaining, + "margin": within_threshold_rate - self.slo_target + } + + def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]: + """ + Calculate error budget burn rate. 
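+ + Worked example (the same numbers as the burn-rate example in this script's CLI epilog): with a 99.9% target, 50 errors out of 10,000 requests in a 1-hour window is a 0.5% error rate against a 0.1% allowance, i.e. a 5x burn rate; sustained, that exhausts a 30-day error budget in about 6 days.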
+ + Args: + errors_in_window: Number of errors in the time window + requests_in_window: Total requests in the time window + window_hours: Size of the time window in hours + + Returns: + Dict with burn rate metrics + """ + if requests_in_window == 0: + return {"error": "No requests in window"} + + # Calculate actual error rate in this window + actual_error_rate = (errors_in_window / requests_in_window) * 100 + + # Calculate allowed error rate for SLO + allowed_error_rate = 100 - self.slo_target + + # Burn rate = actual error rate / allowed error rate + burn_rate = actual_error_rate / allowed_error_rate if allowed_error_rate > 0 else float('inf') + + # Time to exhaustion: at a burn rate of B, the full period's error budget + # is consumed in period_length / B (e.g. a 5x burn empties a 30-day budget in 6 days) + if burn_rate > 0: + hours_to_exhaustion = (self.period_days * 24) / burn_rate + else: + hours_to_exhaustion = float('inf') + + # Determine severity + if burn_rate >= 14.4: # 1 hour window, burns budget in 2 days + severity = "critical" + elif burn_rate >= 6: # 6 hour window, burns budget in 5 days + severity = "warning" + elif burn_rate >= 1: + severity = "elevated" + else: + severity = "normal" + + return { + "window_hours": window_hours, + "requests_in_window": requests_in_window, + "errors_in_window": errors_in_window, + "actual_error_rate": actual_error_rate, + "allowed_error_rate": allowed_error_rate, + "burn_rate": burn_rate, + "hours_to_exhaustion": hours_to_exhaustion, + "severity": severity + } + + @staticmethod + def print_slo_table(): + """Print table of common SLO targets and allowed downtime.""" + if not tabulate: + print("Install tabulate for formatted output: pip install tabulate") + return + + print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME") + print("="*60) + + headers = ["SLO", "Year", "Month", "Week", "Day"] + rows = [] + + for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True): + row = [ + f"{slo}%", + f"{downtimes['year']:.2f} days", + f"{downtimes['month']:.2f} days", + f"{downtimes['week']:.2f} days", + f"{downtimes['day']:.2f} days" + ] + rows.append(row) + + print(tabulate(rows, headers=headers, tablefmt="grid")) + + +def print_availability_results(results: Dict[str, Any]): + """Print availability SLO results.""" + print("\n" + "="*60) + print("📊 AVAILABILITY SLO COMPLIANCE") + print("="*60) + + if "error" in results: + print(f"\n❌ Error: {results['error']}") + return + + status_emoji = "✅" if results['slo_met'] else "❌" + print(f"\n{status_emoji} SLO Status: {'MET' if results['slo_met'] else 'VIOLATED'}") + print(f" Target: {results['slo_target']}%") + print(f" Actual: {results['success_rate']:.3f}%") + print(f" Margin: {results['margin']:+.3f}%") + + print(f"\n📈 Request Statistics:") + print(f" Total Requests: {results['total_requests']:,}") + print(f" Successful: {results['success_requests']:,}") + print(f" Failed: {results['failed_requests']:,}") + print(f" Error Rate: {results['error_rate']:.3f}%") + + print(f"\n💰 Error Budget:") + budget_emoji = "✅" if results['error_budget_remaining'] > 20 else "⚠️" if results['error_budget_remaining'] > 0 else "❌" + print(f" {budget_emoji} Remaining: {results['error_budget_remaining']:.1f}%") + print(f" Consumed: {results['error_budget_consumed']:.1f}%") + print(f" Allowed Failures: {results['error_budget_total']:.0f}") + + print("\n" + "="*60) + + +def print_burn_rate_results(results: Dict[str, Any]): + """Print burn rate results.""" + print("\n" + "="*60) + print("🔥 ERROR BUDGET BURN RATE") + print("="*60) + + if "error" in results: + print(f"\n❌ Error: {results['error']}") + 
return + + severity_emoji = { + "critical": "🔴", + "warning": "🟡", + "elevated": "🟠", + "normal": "🟢" + } + + print(f"\n{severity_emoji.get(results['severity'], '❓')} Severity: {results['severity'].upper()}") + print(f" Burn Rate: {results['burn_rate']:.2f}x") + print(f" Time to Exhaustion: {results['hours_to_exhaustion']:.1f} hours ({results['hours_to_exhaustion']/24:.1f} days)") + + print(f"\n📊 Window Statistics:") + print(f" Window: {results['window_hours']} hours") + print(f" Requests: {results['requests_in_window']:,}") + print(f" Errors: {results['errors_in_window']:,}") + print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%") + print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%") + + print("\n" + "="*60) + + +def main(): + parser = argparse.ArgumentParser( + description="Calculate SLO compliance and error budgets", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Show SLO reference table + python3 slo_calculator.py --table + + # Calculate availability SLO + python3 slo_calculator.py availability \\ + --slo 99.9 \\ + --total-requests 1000000 \\ + --failed-requests 1500 \\ + --period-days 30 + + # Calculate latency SLO + python3 slo_calculator.py latency \\ + --slo 99.5 \\ + --total-requests 500000 \\ + --slow-requests 3000 \\ + --period-days 7 + + # Calculate burn rate + python3 slo_calculator.py burn-rate \\ + --slo 99.9 \\ + --errors 50 \\ + --requests 10000 \\ + --window-hours 1 + """ + ) + + parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'], + help='Calculation mode') + parser.add_argument('--table', action='store_true', help='Show SLO reference table') + parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)') + parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)') + + # Availability SLO arguments + parser.add_argument('--total-requests', type=int, help='Total number of requests') + parser.add_argument('--failed-requests', type=int, help='Number of failed requests') + + # Latency SLO arguments + parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold') + + # Burn rate arguments + parser.add_argument('--errors', type=int, help='Number of errors in window') + parser.add_argument('--requests', type=int, help='Number of requests in window') + parser.add_argument('--window-hours', type=float, help='Window size in hours') + + args = parser.parse_args() + + # Show table if requested + if args.table: + SLOCalculator.print_slo_table() + return + + if not args.mode: + parser.print_help() + return + + if not args.slo: + print("❌ --slo required") + sys.exit(1) + + calculator = SLOCalculator(args.slo, args.period_days) + + if args.mode == 'availability': + if not args.total_requests or args.failed_requests is None: + print("❌ --total-requests and --failed-requests required") + sys.exit(1) + + results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests) + print_availability_results(results) + + elif args.mode == 'latency': + if not args.total_requests or args.slow_requests is None: + print("❌ --total-requests and --slow-requests required") + sys.exit(1) + + results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests) + # calculate_latency_slo reports latency-specific keys; map them onto the + # availability keys so the shared printer can be reused without a KeyError + results.update({ + "success_rate": results["within_threshold_rate"], + "error_rate": 100 - results["within_threshold_rate"], + "success_requests": results["requests_within_threshold"], + "failed_requests": results["requests_exceeding_threshold"], + }) + print_availability_results(results) + + elif args.mode == 'burn-rate': + if not all([args.errors is not None, args.requests, args.window_hours]): + print("❌ --errors, --requests, and --window-hours required")
+ sys.exit(1) + + results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours) + print_burn_rate_results(results) + + +if __name__ == "__main__": + main()