Initial commit

Zhongwei Li
2025-11-29 18:29:23 +08:00
commit ebc71f5387
37 changed files with 9382 additions and 0 deletions

@@ -0,0 +1,72 @@
# Observability Templates
Copy-paste ready configuration templates for Prometheus, Grafana, and OpenTelemetry.
## Templates Overview
### Grafana Dashboard Template
**File**: [grafana-dashboard.json](grafana-dashboard.json)
Production-ready Golden Signals dashboard:
- **Request Rate**: Total and per-method RPS over 5-minute rate windows
- **Error Rate**: Percentage of 5xx errors with alert thresholds
- **Latency**: p50/p95/p99 percentiles in milliseconds
- **Saturation**: CPU and memory usage percentages
**Use when**: Creating new service dashboards, standardizing monitoring
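**Sanity check** (a minimal sketch, assuming Prometheus is reachable at `localhost:9090` and your service carries the same `service` label the panels filter on):
```bash
# Confirm the metrics the dashboard panels query actually exist for your service
SERVICE="YOUR_SERVICE"                 # replace, as in the template
PROM="http://localhost:9090"           # assumed Prometheus address

# Request-rate and error-rate panels use http_requests_total
curl -sG "$PROM/api/v1/query" \
  --data-urlencode "query=sum(rate(http_requests_total{service=\"$SERVICE\"}[5m]))"

# Latency panels use http_request_duration_seconds_bucket
curl -sG "$PROM/api/v1/query" \
  --data-urlencode "query=histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"$SERVICE\"}[5m])) by (le))"
```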
---
### SLO Definition Template
**File**: [slo-definition.yaml](slo-definition.yaml)
Service Level Objective configuration:
- **SLO tiers**: Critical (99.95%), Essential (99.9%), Standard (99.5%)
- **SLI definitions**: Availability, latency, error rate
- **Error budget policy**: Feature freeze thresholds
- **Multi-window burn rate alerts**: 1h, 6h, 24h windows
**Use when**: Implementing SLO framework for new services
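**Budget math** (the tier targets translate directly into allowed downtime per 30-day window; plain shell arithmetic, independent of any SLO tooling):
```bash
# Allowed downtime per 30-day window (43,200 minutes) for each SLO tier
for target in 99.95 99.9 99.5; do
  awk -v t="$target" \
    'BEGIN { printf "%s%% target -> %.1f minutes of error budget per 30 days\n", t, (1 - t/100) * 43200 }'
done
# 99.95% -> 21.6 min, 99.9% -> 43.2 min, 99.5% -> 216.0 min
```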
---
### Prometheus Recording Rules
**File**: [prometheus-recording-rules.yaml](prometheus-recording-rules.yaml)
Pre-aggregated metrics for fast dashboards:
- **Request rates**: Per-service, per-endpoint RPS
- **Error rates**: 5xx-to-total request ratios
- **Latency percentiles**: p50/p95/p99 pre-computed
- **Error budget**: Remaining budget and burn rate
**Use when**: Optimizing slow dashboard queries, implementing SLOs
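**Sanity check** (a sketch, assuming `promtool` is on your PATH and Prometheus serves its HTTP API at `localhost:9090`):
```bash
# Lint the rule file (promtool ships with Prometheus)
promtool check rules templates/prometheus-recording-rules.yaml

# After Prometheus has evaluated the group at least once, spot-check a recorded series
curl -sG http://localhost:9090/api/v1/query \
  --data-urlencode 'query=greyhaven:http_requests:rate5m'
```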
---
## Quick Usage
```bash
# Copy template to your monitoring directory
cp templates/grafana-dashboard.json ../monitoring/dashboards/
# Edit service name and thresholds
vim ../monitoring/dashboards/grafana-dashboard.json
# Import to Grafana
curl -X POST http://admin:password@localhost:3000/api/dashboards/db \
-H "Content-Type: application/json" \
-d @../monitoring/dashboards/grafana-dashboard.json
```
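Grafana only needs the dashboard JSON; the recording rules and SLO definition are consumed by Prometheus (or whatever controller handles the `ServiceLevelObjective` resource). A sketch of loading the recording rules, assuming a conventional `/etc/prometheus` layout and a Prometheus started with `--web.enable-lifecycle` so it accepts HTTP reloads:
```bash
# Place the rules where prometheus.yml references them, e.g.
#   rule_files:
#     - "rules/*.yaml"
cp templates/prometheus-recording-rules.yaml /etc/prometheus/rules/

# Validate the combined configuration, then hot-reload Prometheus
promtool check config /etc/prometheus/prometheus.yml
curl -X POST http://localhost:9090/-/reload
```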
## Related Documentation
- **Examples**: [Examples Index](../examples/INDEX.md) - Full implementations
- **Reference**: [Reference Index](../reference/INDEX.md) - PromQL, SLO guides
- **Main Agent**: [observability-engineer.md](../observability-engineer.md) - Observability agent
---
Return to [main agent](../observability-engineer.md)

@@ -0,0 +1,210 @@
{
"dashboard": {
"title": "Golden Signals - [Service Name]",
"tags": ["golden-signals", "production", "slo"],
"timezone": "UTC",
"refresh": "30s",
"time": {
"from": "now-6h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Request Rate (RPS)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))",
"legendFormat": "Total RPS",
"refId": "A"
},
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m])) by (method)",
"legendFormat": "{{method}}",
"refId": "B"
}
],
"yaxes": [
{"format": "reqps", "label": "Requests/sec"},
{"format": "short"}
],
"legend": {"show": true, "alignAsTable": true, "avg": true, "max": true, "current": true}
},
{
"id": 2,
"title": "Error Rate (%)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"targets": [
{
"expr": "(sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))) * 100",
"legendFormat": "Error Rate %",
"refId": "A"
}
],
"yaxes": [
{"format": "percent", "label": "Error %", "max": 5},
{"format": "short"}
],
"alert": {
"name": "High Error Rate",
"conditions": [
{
"evaluator": {"params": [1], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A", "5m", "now"]},
"type": "query"
}
],
"frequency": "1m",
"for": "5m",
"message": "Error rate > 1% for 5 minutes",
"noDataState": "no_data",
"notifications": []
},
"thresholds": [
{"value": 1, "colorMode": "critical", "op": "gt", "fill": true, "line": true}
]
},
{
"id": 3,
"title": "Request Latency (p50/p95/p99)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p99",
"refId": "C"
}
],
"yaxes": [
{"format": "ms", "label": "Latency (ms)"},
{"format": "short"}
],
"thresholds": [
{"value": 200, "colorMode": "warning", "op": "gt"},
{"value": 500, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 4,
"title": "Resource Saturation (CPU/Memory %)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
},
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "Memory %",
"refId": "B"
}
],
"yaxes": [
{"format": "percent", "label": "Usage %", "max": 100},
{"format": "short"}
],
"thresholds": [
{"value": 80, "colorMode": "warning", "op": "gt"},
{"value": 90, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 5,
"title": "Top 10 Slowest Endpoints",
"type": "table",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"targets": [
{
"expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le, path))) * 1000",
"legendFormat": "",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"path": "Endpoint", "Value": "p95 Latency (ms)"}}}
]
},
{
"id": 6,
"title": "SLO Status (30-day)",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])) * 100",
"refId": "A"
}
],
"options": {
"graphMode": "none",
"textMode": "value_and_name",
"colorMode": "background"
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"decimals": 3,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 99.5, "color": "yellow"},
{"value": 99.9, "color": "green"}
]
}
}
}
},
{
"id": 7,
"title": "Error Budget Remaining",
"type": "gauge",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
"targets": [
{
"expr": "(1 - ((1 - (sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])))) / (1 - 0.999))) * 100",
"refId": "A"
}
],
"options": {
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 25, "color": "yellow"},
{"value": 50, "color": "green"}
]
}
}
}
}
]
}
}

@@ -0,0 +1,188 @@
# Prometheus Recording Rules Template
# Pre-aggregated metrics for fast dashboard queries and SLO tracking
# Replace YOUR_SERVICE with actual service name
groups:
# HTTP Request Rates
- name: http_request_rates
interval: 15s
rules:
# Total request rate (per-second)
- record: greyhaven:http_requests:rate5m
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
# Request rate by service
- record: greyhaven:http_requests:rate5m:by_service
expr: sum(rate(http_requests_total[5m])) by (service)
# Request rate by endpoint
- record: greyhaven:http_requests:rate5m:by_endpoint
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# Request rate by method
- record: greyhaven:http_requests:rate5m:by_method
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (method)
# Request rate by status code
- record: greyhaven:http_requests:rate5m:by_status
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (status)
# HTTP Error Rates
- name: http_error_rates
interval: 15s
rules:
# Error rate (percentage)
- record: greyhaven:http_errors:rate5m
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
# Error rate by service
- record: greyhaven:http_errors:rate5m:by_service
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
/
sum(rate(http_requests_total[5m])) by (service)
# Error rate by endpoint
- record: greyhaven:http_errors:rate5m:by_endpoint
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) by (endpoint)
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# HTTP Latency (Duration)
- name: http_latency
interval: 15s
rules:
# p50 latency (median)
- record: greyhaven:http_latency:p50
expr: histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# p95 latency
- record: greyhaven:http_latency:p95
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# p99 latency
- record: greyhaven:http_latency:p99
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# Average latency
- record: greyhaven:http_latency:avg
expr: |
sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m]))
# p95 latency by endpoint
- record: greyhaven:http_latency:p95:by_endpoint
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le, endpoint))
# Resource Saturation
- name: resource_saturation
interval: 15s
rules:
# CPU usage percentage
- record: greyhaven:cpu_usage:percent
expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage percentage
- record: greyhaven:memory_usage:percent
expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
# Disk usage percentage
- record: greyhaven:disk_usage:percent
expr: 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
# Database connection pool saturation
- record: greyhaven:db_pool:saturation
expr: |
db_pool_connections_active{service="YOUR_SERVICE"}
/
db_pool_connections_max{service="YOUR_SERVICE"}
# SLI Calculations (Multi-Window)
- name: sli_calculations
interval: 30s
rules:
# Availability SLI - 1 hour window
- record: greyhaven:sli:availability:1h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h]))
# Availability SLI - 6 hour window
- record: greyhaven:sli:availability:6h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h]))
# Availability SLI - 24 hour window
- record: greyhaven:sli:availability:24h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h]))
# Availability SLI - 30 day window
- record: greyhaven:sli:availability:30d
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d]))
# Latency SLI - 1 hour window (% requests < 200ms)
- record: greyhaven:sli:latency:1h
expr: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h]))
# Latency SLI - 30 day window
- record: greyhaven:sli:latency:30d
expr: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d]))
# Error Budget Tracking
- name: error_budget
interval: 30s
rules:
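# The SLI series above are already scoped to YOUR_SERVICE (sum() drops the service label),
# so they are referenced below without a label matcher.
# Remaining budget = 1 - (observed error rate / allowed error rate).
# Example: 99.95% measured availability against a 99.9% target -> 1 - (0.0005 / 0.001) = 0.5 (50% left).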
# Error budget remaining (for 99.9% SLO)
- record: greyhaven:error_budget:remaining:30d
expr: |
1 - (
(1 - greyhaven:sli:availability:30d)
/
(1 - 0.999)
)
# Error budget burn rate - 1 hour window
- record: greyhaven:error_budget:burn_rate:1h
expr: |
(1 - greyhaven:sli:availability:1h)
/
(1 - 0.999)
# Error budget burn rate - 6 hour window
- record: greyhaven:error_budget:burn_rate:6h
expr: |
(1 - greyhaven:sli:availability:6h)
/
(1 - 0.999)
# Error budget burn rate - 24 hour window
- record: greyhaven:error_budget:burn_rate:24h
expr: |
(1 - greyhaven:sli:availability:24h)
/
(1 - 0.999)
# Error budget consumed (minutes of downtime)
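# 30 days = 43,200 minutes, so multiplying the unavailability fraction by 43200 gives downtime-equivalent minutes spent.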
- record: greyhaven:error_budget:consumed:30d
expr: |
(1 - greyhaven:sli:availability:30d) * 43200

@@ -0,0 +1,173 @@
# SLO Definition Template
# Replace YOUR_SERVICE with actual service name
# Replace 99.9 with your target SLO (99.5, 99.9, or 99.95)
apiVersion: monitoring.greyhaven.io/v1
kind: ServiceLevelObjective
metadata:
name: YOUR_SERVICE-slo
namespace: production
spec:
# Service identification
service: YOUR_SERVICE
environment: production
# SLO tier (critical, essential, standard)
tier: essential
# Time window (30 days recommended)
window: 30d
# SLO targets
objectives:
- name: availability
target: 99.9 # 99.9% = 43.2 min downtime/month
indicator:
type: ratio
success_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[{{.window}}]))
total_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))
- name: latency
target: 95 # 95% of requests < 200ms
indicator:
type: ratio
success_query: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[{{.window}}]))
total_query: |
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[{{.window}}]))
- name: error_rate
target: 99.5 # <0.5% error rate
indicator:
type: ratio
success_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status!~"5.."}[{{.window}}]))
total_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))
# Error budget policy
errorBudget:
policy:
- budget_range: [75%, 100%]
action: "Normal feature development"
approval: "Engineering team"
- budget_range: [50%, 75%]
action: "Monitor closely, increase testing"
approval: "Engineering team"
- budget_range: [25%, 50%]
action: "Prioritize reliability work, reduce risky changes"
approval: "Engineering manager"
- budget_range: [0%, 25%]
action: "Feature freeze, all hands on reliability"
approval: "VP Engineering"
requirements:
- "Daily reliability standup"
- "Postmortem for all incidents"
- "No new features until budget >50%"
- budget_range: [0%, 0%]
action: "SLO violation - mandatory postmortem"
approval: "VP Engineering + CTO"
requirements:
- "Complete postmortem within 48 hours"
- "Action items with owners and deadlines"
- "Present to exec team"
# Multi-window burn rate alerts
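# burn_rate_threshold is the ratio of the observed error rate to the error rate the SLO allows;
# sustained at N, a 30-day error budget is exhausted in 30/N days (14.4 -> ~2 days, 6 -> 5 days, 3 -> 10 days).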
alerts:
- name: error-budget-burn-rate-critical
severity: critical
windows:
short: 1h
long: 6h
burn_rate_threshold: 14.4 # At this rate the 30-day budget is exhausted in ~2 days (30 / 14.4)
for: 2m
annotations:
summary: "Critical burn rate - budget exhausted in 2 hours"
description: "Service {{ $labels.service }} is burning error budget 14.4x faster than expected"
runbook: "https://runbooks.greyhaven.io/slo-burn-rate"
notifications:
- type: pagerduty
severity: critical
- name: error-budget-burn-rate-high
severity: warning
windows:
short: 6h
long: 24h
burn_rate_threshold: 6 # Budget exhausted in 5 days
for: 15m
annotations:
summary: "High burn rate - budget exhausted in 5 days"
description: "Service {{ $labels.service }} is burning error budget 6x faster than expected"
notifications:
- type: slack
channel: "#alerts-reliability"
- name: error-budget-burn-rate-medium
severity: warning
windows:
short: 24h
long: 24h
burn_rate_threshold: 3 # Budget exhausted in 10 days
for: 1h
annotations:
summary: "Medium burn rate - budget exhausted in 10 days"
notifications:
- type: slack
channel: "#alerts-reliability"
- name: error-budget-low
severity: warning
threshold: 0.25 # 25% remaining
for: 5m
annotations:
summary: "Error budget low ({{ $value | humanizePercentage }} remaining)"
description: "Consider feature freeze per error budget policy"
notifications:
- type: slack
channel: "#engineering-managers"
- name: error-budget-depleted
severity: critical
threshold: 0 # 0% remaining
for: 5m
annotations:
summary: "Error budget depleted - feature freeze required"
description: "SLO violated. Postmortem required within 48 hours."
notifications:
- type: pagerduty
severity: critical
- type: slack
channel: "#exec-alerts"
# Review cadence
review:
frequency: weekly
participants:
- team: engineering
- team: product
- team: sre
agenda:
- "Current error budget status"
- "Burn rate trends"
- "Recent incidents and impact"
- "Upcoming risky changes"
# Reporting
reporting:
dashboard:
grafana_uid: YOUR_SERVICE_slo_dashboard
panels:
- slo_status
- error_budget_remaining
- burn_rate_multiwindow
- incident_timeline
export:
format: prometheus
recording_rules: true