Files
gh-greyhaven-ai-claude-code…/skills/observability-engineering/templates/prometheus-recording-rules.yaml
2025-11-29 18:29:23 +08:00

189 lines
6.7 KiB
YAML

# Prometheus Recording Rules Template
# Pre-aggregated metrics for fast dashboard queries and SLO tracking
# Replace YOUR_SERVICE with the actual service name before loading these rules
groups:
# HTTP Request Rates
- name: http_request_rates
interval: 15s
rules:
# Total request rate (per-second)
- record: greyhaven:http_requests:rate5m
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
# Request rate by service
- record: greyhaven:http_requests:rate5m:by_service
expr: sum(rate(http_requests_total[5m])) by (service)
# Request rate by endpoint
- record: greyhaven:http_requests:rate5m:by_endpoint
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# Request rate by method
- record: greyhaven:http_requests:rate5m:by_method
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (method)
# Request rate by status code
- record: greyhaven:http_requests:rate5m:by_status
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (status)
# HTTP error ratios, derived from the http_requests_total counter.
# An "error" here is any response whose `status` label matches 5xx.
# Each rule divides the 5xx rate by the total rate over a 5-minute window,
# so the recorded value is a ratio (0-1); nothing is multiplied by 100.
- name: http_error_rates
  interval: 15s
  rules:
  # Share of YOUR_SERVICE requests answered with 5xx (single series).
  - record: greyhaven:http_errors:rate5m
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
  # 5xx share for every service (no service filter); one series per
  # `service` label value.
  - record: greyhaven:http_errors:rate5m:by_service
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
      /
      sum(rate(http_requests_total[5m])) by (service)
  # 5xx share of YOUR_SERVICE requests, one series per `endpoint`.
  - record: greyhaven:http_errors:rate5m:by_endpoint
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) by (endpoint)
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# HTTP latency for YOUR_SERVICE, derived from the
# http_request_duration_seconds histogram over a 5-minute window.
# Values are in the histogram's own unit (seconds, going by the metric name).
- name: http_latency
  interval: 15s
  rules:
  # Median (50th percentile) latency across all YOUR_SERVICE requests.
  - record: greyhaven:http_latency:p50
    expr: >-
      histogram_quantile(0.50,
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      by (le))
  # 95th percentile latency.
  - record: greyhaven:http_latency:p95
    expr: >-
      histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      by (le))
  # 99th percentile latency.
  - record: greyhaven:http_latency:p99
    expr: >-
      histogram_quantile(0.99,
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      by (le))
  # Mean latency: rate of the histogram's _sum divided by rate of its _count.
  - record: greyhaven:http_latency:avg
    expr: |
      sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m]))
  # 95th percentile latency, one series per `endpoint`.
  - record: greyhaven:http_latency:p95:by_endpoint
    expr: >-
      histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      by (le, endpoint))
# Resource saturation gauges for host CPU, memory, disk and the database
# connection pool. The node_* rules carry no service filter.
- name: resource_saturation
  interval: 15s
  rules:
  # CPU busy percentage: 100 minus the idle share. avg() has no grouping
  # clause, so every matched series is folded into one value.
  # NOTE(review): irate() is used rather than rate(); confirm the
  # short-term reading is what dashboards expect here.
  - record: greyhaven:cpu_usage:percent
    expr: >-
      100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  # Memory in use as a percentage of total. No aggregation, so one output
  # series per matching pair of input series.
  - record: greyhaven:memory_usage:percent
    expr: >-
      100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
  # Disk space in use as a percentage of size. No aggregation or filesystem
  # filter, so one output series per matching pair of input series.
  - record: greyhaven:disk_usage:percent
    expr: >-
      100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
  # Connection pool saturation for YOUR_SERVICE: active connections divided
  # by the configured maximum (a ratio, not multiplied by 100).
  - record: greyhaven:db_pool:saturation
    expr: |
      db_pool_connections_active{service="YOUR_SERVICE"}
      /
      db_pool_connections_max{service="YOUR_SERVICE"}
# Service Level Indicators for YOUR_SERVICE over several look-back windows.
# Each value is a ratio (good events / all events). Both sides are sum()
# without a grouping clause, so every recorded series is label-free.
# NOTE(review): the 30d rules read a 30-day range on every 30s evaluation;
# confirm the query cost is acceptable for the target Prometheus.
- name: sli_calculations
  interval: 30s
  rules:
  # Availability: requests with a 2xx or 3xx status divided by all requests.
  # Any other status (including 4xx) counts against availability.
  # 1-hour window.
  - record: greyhaven:sli:availability:1h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h]))
  # Availability, 6-hour window.
  - record: greyhaven:sli:availability:6h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h]))
  # Availability, 24-hour window.
  - record: greyhaven:sli:availability:24h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h]))
  # Availability, 30-day window.
  - record: greyhaven:sli:availability:30d
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d]))
  # Latency: share of requests in the le="0.2" histogram bucket (200 ms or
  # faster), 1-hour window.
  # NOTE(review): assumes the histogram exposes a bucket labelled exactly
  # le="0.2" - confirm against the instrumentation.
  - record: greyhaven:sli:latency:1h
    expr: |
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h]))
  # Latency, 30-day window (same le="0.2" threshold).
  - record: greyhaven:sli:latency:30d
    expr: |
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d]))
# Error budget tracking against a 99.9% availability SLO.
# The target is hard-coded as 0.999 in every rule; change all occurrences
# together if the SLO changes.
#
# Inputs are the greyhaven:sli:availability:* recorded series. Those series
# are produced by sum() without a grouping clause and so carry no `service`
# label. They must be selected WITHOUT a {service="..."} matcher: such a
# matcher matches nothing and leaves every rule in this group empty.
- name: error_budget
  interval: 30s
  rules:
  # Fraction of the 30-day error budget still unspent.
  # 1 = untouched, 0 = fully spent, negative = SLO violated.
  - record: greyhaven:error_budget:remaining:30d
    expr: |
      1 - (
        (1 - greyhaven:sli:availability:30d)
        /
        (1 - 0.999)
      )
  # Burn rate: observed error ratio divided by the allowed error ratio.
  # 1 means the budget is being spent exactly at the sustainable pace.
  # 1-hour window.
  - record: greyhaven:error_budget:burn_rate:1h
    expr: |
      (1 - greyhaven:sli:availability:1h)
      /
      (1 - 0.999)
  # Burn rate, 6-hour window.
  - record: greyhaven:error_budget:burn_rate:6h
    expr: |
      (1 - greyhaven:sli:availability:6h)
      /
      (1 - 0.999)
  # Burn rate, 24-hour window.
  - record: greyhaven:error_budget:burn_rate:24h
    expr: |
      (1 - greyhaven:sli:availability:24h)
      /
      (1 - 0.999)
  # Budget consumed, expressed as equivalent minutes of full downtime.
  # 43200 = 30 days * 24 hours * 60 minutes.
  - record: greyhaven:error_budget:consumed:30d
    expr: |
      (1 - greyhaven:sli:availability:30d) * 43200