# Prometheus Recording Rules Template
#
# Pre-aggregated metrics for fast dashboard queries and SLO tracking.
# Replace YOUR_SERVICE with the actual service name before loading this file.
#
# Conventions used below:
#   - Every "rate"/"ratio" rule yields a plain ratio or per-second value;
#     only the rules whose record name ends in ":percent" are scaled to 0-100.
#   - The SLO target is hard-coded as 0.999 (99.9%) in the error_budget group.

groups:
  # HTTP Request Rates
  - name: http_request_rates
    interval: 15s
    rules:
      # Total request rate (requests per second, averaged over 5m)
      - record: greyhaven:http_requests:rate5m
        expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

      # Request rate by service (all services, not filtered to YOUR_SERVICE)
      - record: greyhaven:http_requests:rate5m:by_service
        expr: sum(rate(http_requests_total[5m])) by (service)

      # Request rate by endpoint
      - record: greyhaven:http_requests:rate5m:by_endpoint
        expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)

      # Request rate by method
      - record: greyhaven:http_requests:rate5m:by_method
        expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (method)

      # Request rate by status code
      - record: greyhaven:http_requests:rate5m:by_status
        expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (status)

  # HTTP Error Rates
  # Only 5xx responses (status=~"5..") are counted as errors in this group.
  - name: http_error_rates
    interval: 15s
    rules:
      # Error ratio (0-1, not a percentage): 5xx rate / total rate
      - record: greyhaven:http_errors:rate5m
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

      # Error ratio by service (all services)
      - record: greyhaven:http_errors:rate5m:by_service
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
          /
          sum(rate(http_requests_total[5m])) by (service)

      # Error ratio by endpoint
      - record: greyhaven:http_errors:rate5m:by_endpoint
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) by (endpoint)
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)

  # HTTP Latency (Duration)
  # Quantiles are estimated from the http_request_duration_seconds histogram;
  # buckets are summed by "le" so all series of the service share one histogram.
  - name: http_latency
    interval: 15s
    rules:
      # p50 latency (median)
      - record: greyhaven:http_latency:p50
        expr: histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))

      # p95 latency
      - record: greyhaven:http_latency:p95
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))

      # p99 latency
      - record: greyhaven:http_latency:p99
        expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))

      # Average latency: rate of summed durations / rate of request count
      - record: greyhaven:http_latency:avg
        expr: |
          sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m]))
          /
          sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m]))

      # p95 latency by endpoint
      - record: greyhaven:http_latency:p95:by_endpoint
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le, endpoint))

  # Resource Saturation
  - name: resource_saturation
    interval: 15s
    rules:
      # CPU usage percentage: 100 minus the average idle fraction.
      # rate() is used rather than irate(): irate() only looks at the last two
      # samples in the window, which makes a recorded series needlessly spiky.
      - record: greyhaven:cpu_usage:percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Memory usage percentage (one series per label set of the node metrics)
      - record: greyhaven:memory_usage:percent
        expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))

      # Disk usage percentage (one series per label set of the filesystem metrics)
      - record: greyhaven:disk_usage:percent
        expr: 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)

      # Database connection pool saturation (ratio 0-1: active / max)
      - record: greyhaven:db_pool:saturation
        expr: |
          db_pool_connections_active{service="YOUR_SERVICE"}
          /
          db_pool_connections_max{service="YOUR_SERVICE"}

  # SLI Calculations (Multi-Window)
  #
  # All SLI rules aggregate "by (service)" so the recorded series keep a
  # "service" label. The error_budget group selects these series with
  # {service="YOUR_SERVICE"}; without the label those selectors match nothing.
  #
  # Availability counts only 2xx/3xx responses as good, so 4xx responses lower
  # availability here even though http_error_rates counts only 5xx as errors.
  #
  # NOTE(review): the 30d-range rules are re-evaluated on this group's 30s
  # interval; consider a longer interval if rule evaluation proves expensive.
  - name: sli_calculations
    interval: 30s
    rules:
      # Availability SLI - 1 hour window
      - record: greyhaven:sli:availability:1h
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h])) by (service)
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h])) by (service)

      # Availability SLI - 6 hour window
      - record: greyhaven:sli:availability:6h
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h])) by (service)
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h])) by (service)

      # Availability SLI - 24 hour window
      - record: greyhaven:sli:availability:24h
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h])) by (service)
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h])) by (service)

      # Availability SLI - 30 day window
      - record: greyhaven:sli:availability:30d
        expr: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d])) by (service)
          /
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d])) by (service)

      # Latency SLI - 1 hour window
      # Ratio (0-1) of requests completing within 200ms (histogram bucket le="0.2").
      - record: greyhaven:sli:latency:1h
        expr: |
          sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h])) by (service)
          /
          sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h])) by (service)

      # Latency SLI - 30 day window (same ratio over 30d)
      - record: greyhaven:sli:latency:30d
        expr: |
          sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d])) by (service)
          /
          sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d])) by (service)

  # Error Budget Tracking
  # (1 - 0.999) is the allowed failure ratio for a 99.9% SLO.
  - name: error_budget
    interval: 30s
    rules:
      # Error budget remaining (for 99.9% SLO): 1 = untouched, 0 = exhausted,
      # negative = budget overspent.
      - record: greyhaven:error_budget:remaining:30d
        expr: |
          1 - (
            (1 - greyhaven:sli:availability:30d{service="YOUR_SERVICE"})
            /
            (1 - 0.999)
          )

      # Error budget burn rate - 1 hour window
      # Observed failure ratio divided by the allowed failure ratio;
      # 1 means failures arrive exactly at the rate the SLO allows.
      - record: greyhaven:error_budget:burn_rate:1h
        expr: |
          (1 - greyhaven:sli:availability:1h{service="YOUR_SERVICE"})
          /
          (1 - 0.999)

      # Error budget burn rate - 6 hour window
      - record: greyhaven:error_budget:burn_rate:6h
        expr: |
          (1 - greyhaven:sli:availability:6h{service="YOUR_SERVICE"})
          /
          (1 - 0.999)

      # Error budget burn rate - 24 hour window
      - record: greyhaven:error_budget:burn_rate:24h
        expr: |
          (1 - greyhaven:sli:availability:24h{service="YOUR_SERVICE"})
          /
          (1 - 0.999)

      # Error budget consumed, expressed as minutes of downtime:
      # failure ratio over 30d * 43200 (30 days * 24 h * 60 min).
      - record: greyhaven:error_budget:consumed:30d
        expr: |
          (1 - greyhaven:sli:availability:30d{service="YOUR_SERVICE"}) * 43200