# Prometheus Recording Rules Template
# Pre-aggregated metrics for fast dashboard queries and SLO tracking
# Replace YOUR_SERVICE with actual service name

groups:
# HTTP request throughput: per-second request rates averaged over a
# 5-minute window, in total and split by common dimensions.
- name: http_request_rates
  interval: 15s
  rules:
  # Overall request rate of YOUR_SERVICE (requests per second).
  - record: greyhaven:http_requests:rate5m
    expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

  # Request rate per service. No service filter is applied here, so every
  # service exposing http_requests_total gets its own series.
  - record: greyhaven:http_requests:rate5m:by_service
    expr: sum by (service) (rate(http_requests_total[5m]))

  # Request rate of YOUR_SERVICE per endpoint.
  - record: greyhaven:http_requests:rate5m:by_endpoint
    expr: sum by (endpoint) (rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

  # Request rate of YOUR_SERVICE per HTTP method.
  - record: greyhaven:http_requests:rate5m:by_method
    expr: sum by (method) (rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

  # Request rate of YOUR_SERVICE per response status code.
  - record: greyhaven:http_requests:rate5m:by_status
    expr: sum by (status) (rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

# HTTP error ratios: share of requests answered with a 5xx status over a
# 5-minute window. The results are fractions (5xx rate / total rate); they
# are not multiplied by 100.
- name: http_error_rates
  interval: 15s
  rules:
  # 5xx ratio of YOUR_SERVICE as a whole.
  - record: greyhaven:http_errors:rate5m
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

  # 5xx ratio per service (no service filter; one series per service).
  - record: greyhaven:http_errors:rate5m:by_service
    expr: |
      sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum by (service) (rate(http_requests_total[5m]))

  # 5xx ratio of YOUR_SERVICE per endpoint.
  - record: greyhaven:http_errors:rate5m:by_endpoint
    expr: |
      sum by (endpoint) (rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
      /
      sum by (endpoint) (rate(http_requests_total{service="YOUR_SERVICE"}[5m]))

# HTTP latency: quantiles and mean of request duration over a 5-minute
# window, derived from the http_request_duration_seconds histogram.
- name: http_latency
  interval: 15s
  rules:
  # Median (p50) request duration of YOUR_SERVICE.
  - record: greyhaven:http_latency:p50
    expr: |
      histogram_quantile(
        0.50,
        sum by (le) (rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      )

  # 95th-percentile request duration of YOUR_SERVICE.
  - record: greyhaven:http_latency:p95
    expr: |
      histogram_quantile(
        0.95,
        sum by (le) (rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      )

  # 99th-percentile request duration of YOUR_SERVICE.
  - record: greyhaven:http_latency:p99
    expr: |
      histogram_quantile(
        0.99,
        sum by (le) (rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      )

  # Mean request duration: rate of summed durations divided by rate of
  # observation count.
  - record: greyhaven:http_latency:avg
    expr: |
      sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m]))

  # 95th-percentile request duration per endpoint; `endpoint` is kept next
  # to `le` so each endpoint gets its own quantile.
  - record: greyhaven:http_latency:p95:by_endpoint
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, endpoint) (rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m]))
      )

# Resource saturation: host CPU, memory and disk utilisation, plus
# database connection-pool usage.
- name: resource_saturation
  interval: 15s
  rules:
  # CPU busy percentage: 100 minus the idle share, averaged over all
  # node_cpu_seconds_total{mode="idle"} series (avg has no `by` clause, so
  # the result is a single unlabelled series).
  # rate() is used rather than irate() so the whole [5m] window is averaged;
  # irate() only looks at the last two samples in the window, which makes
  # the recorded value spiky and inconsistent with the other 5m rules.
  - record: greyhaven:cpu_usage:percent
    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

  # Memory in use as a percentage of total, from MemAvailable / MemTotal.
  # No aggregation: one output series per matching input label set.
  - record: greyhaven:memory_usage:percent
    expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))

  # Disk space in use as a percentage of filesystem size.
  # No aggregation or filtering: every filesystem series is included.
  - record: greyhaven:disk_usage:percent
    expr: 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)

  # Connection-pool saturation of YOUR_SERVICE: active connections divided
  # by the pool maximum (a ratio, not multiplied by 100).
  - record: greyhaven:db_pool:saturation
    expr: |
      db_pool_connections_active{service="YOUR_SERVICE"}
      /
      db_pool_connections_max{service="YOUR_SERVICE"}

# SLI calculations over several look-back windows. Every rule aggregates
# with a bare sum(), so each recorded series is a single fraction with no
# labels (in particular, no `service` label).
# NOTE(review): the 30d range queries are re-evaluated every 30s; this is
# presumably expensive on large installations - confirm it is acceptable.
- name: sli_calculations
  interval: 30s
  rules:
  # Availability SLI, 1-hour window: share of requests with a 2xx or 3xx
  # status. Any other status counts against availability.
  - record: greyhaven:sli:availability:1h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h]))

  # Availability SLI, 6-hour window.
  - record: greyhaven:sli:availability:6h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h]))

  # Availability SLI, 24-hour window.
  - record: greyhaven:sli:availability:24h
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h]))

  # Availability SLI, 30-day window.
  - record: greyhaven:sli:availability:30d
    expr: |
      sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d]))
      /
      sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d]))

  # Latency SLI, 1-hour window: fraction of requests that fall in the
  # le="0.2" histogram bucket, i.e. completed within 200ms.
  - record: greyhaven:sli:latency:1h
    expr: |
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h]))

  # Latency SLI, 30-day window (same 200ms threshold).
  - record: greyhaven:sli:latency:30d
    expr: |
      sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d]))
      /
      sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d]))

# Error budget tracking against a 99.9% availability SLO, derived from the
# greyhaven:sli:availability:* recorded series.
#
# The SLI series are selected WITHOUT a {service="..."} matcher: the rules
# that record them aggregate with a bare sum(), so the series carry no
# `service` label and such a matcher would select nothing, leaving every
# rule in this group without data.
- name: error_budget
  interval: 30s
  rules:
  # Fraction of the 30-day error budget still unspent:
  # 1 = untouched, 0 = fully spent, negative = SLO violated.
  - record: greyhaven:error_budget:remaining:30d
    expr: |
      1 - (
        (1 - greyhaven:sli:availability:30d)
        /
        (1 - 0.999)
      )

  # Burn rate, 1-hour window: observed error ratio divided by the ratio the
  # SLO allows (1 - 0.999). 1 means errors arrive exactly at the allowed pace.
  - record: greyhaven:error_budget:burn_rate:1h
    expr: |
      (1 - greyhaven:sli:availability:1h)
      /
      (1 - 0.999)

  # Burn rate, 6-hour window.
  - record: greyhaven:error_budget:burn_rate:6h
    expr: |
      (1 - greyhaven:sli:availability:6h)
      /
      (1 - 0.999)

  # Burn rate, 24-hour window.
  - record: greyhaven:error_budget:burn_rate:24h
    expr: |
      (1 - greyhaven:sli:availability:24h)
      /
      (1 - 0.999)

  # Budget consumed, expressed as equivalent minutes of downtime:
  # 30-day unavailability ratio times 43200 (= 30 * 24 * 60 minutes).
  - record: greyhaven:error_budget:consumed:30d
    expr: |
      (1 - greyhaven:sli:availability:30d) * 43200