Initial commit
This commit is contained in:
@@ -0,0 +1,188 @@
# Prometheus Recording Rules Template
# Pre-aggregated metrics for fast dashboard queries and SLO tracking.
# Replace YOUR_SERVICE with the actual service name.

groups:
# Request throughput: per-second request rates over a 5-minute window.
- name: http_request_rates
  interval: 15s
  rules:
  # Overall request rate for YOUR_SERVICE.
  - record: greyhaven:http_requests:rate5m
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

  # Request rate per service (no service filter: covers every service).
  - record: greyhaven:http_requests:rate5m:by_service
    expr: |
      sum by (service) (
        rate(http_requests_total[5m])
      )

  # Request rate for YOUR_SERVICE, split by endpoint.
  - record: greyhaven:http_requests:rate5m:by_endpoint
    expr: |
      sum by (endpoint) (
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

  # Request rate for YOUR_SERVICE, split by HTTP method.
  - record: greyhaven:http_requests:rate5m:by_method
    expr: |
      sum by (method) (
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

  # Request rate for YOUR_SERVICE, split by response status code.
  - record: greyhaven:http_requests:rate5m:by_status
    expr: |
      sum by (status) (
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

# Error ratios: share of requests answered with a 5xx status over a
# 5-minute window. Each value is 5xx rate divided by total rate, i.e. a
# ratio between 0 and 1 (multiply by 100 in the dashboard for a percentage).
- name: http_error_rates
  interval: 15s
  rules:
  # Overall 5xx ratio for YOUR_SERVICE.
  - record: greyhaven:http_errors:rate5m
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE", status=~"5.."}[5m])
      )
      /
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

  # 5xx ratio per service (no service filter: covers every service).
  - record: greyhaven:http_errors:rate5m:by_service
    expr: |
      sum by (service) (
        rate(http_requests_total{status=~"5.."}[5m])
      )
      /
      sum by (service) (
        rate(http_requests_total[5m])
      )

  # 5xx ratio for YOUR_SERVICE, split by endpoint.
  - record: greyhaven:http_errors:rate5m:by_endpoint
    expr: |
      sum by (endpoint) (
        rate(http_requests_total{service="YOUR_SERVICE", status=~"5.."}[5m])
      )
      /
      sum by (endpoint) (
        rate(http_requests_total{service="YOUR_SERVICE"}[5m])
      )

# Request latency for YOUR_SERVICE, derived from the
# http_request_duration_seconds histogram over a 5-minute window.
# Results are presumably in seconds, per the metric name.
- name: http_latency
  interval: 15s
  rules:
  # Median (50th percentile) latency.
  - record: greyhaven:http_latency:p50
    expr: |
      histogram_quantile(
        0.50,
        sum by (le) (
          rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])
        )
      )

  # 95th percentile latency.
  - record: greyhaven:http_latency:p95
    expr: |
      histogram_quantile(
        0.95,
        sum by (le) (
          rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])
        )
      )

  # 99th percentile latency.
  - record: greyhaven:http_latency:p99
    expr: |
      histogram_quantile(
        0.99,
        sum by (le) (
          rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])
        )
      )

  # Mean latency: rate of total observed duration over rate of request count.
  - record: greyhaven:http_latency:avg
    expr: |
      sum(
        rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m])
      )
      /
      sum(
        rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m])
      )

  # 95th percentile latency per endpoint (`le` must stay in the grouping so
  # histogram_quantile still sees the bucket boundaries).
  - record: greyhaven:http_latency:p95:by_endpoint
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, endpoint) (
          rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])
        )
      )

# Resource saturation: host CPU / memory / disk usage and database
# connection-pool utilisation.
- name: resource_saturation
  interval: 15s
  rules:
  # CPU busy percentage: 100 minus the average idle-mode rate (as a percent)
  # across every matched node_cpu_seconds_total series.
  # NOTE(review): irate only looks at the last two samples in the window;
  # confirm that is intended here rather than rate().
  - record: greyhaven:cpu_usage:percent
    expr: |
      100 - (
        avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100
      )

  # Memory in use as a percentage of total, based on MemAvailable.
  # No aggregation: one output series per input series.
  - record: greyhaven:memory_usage:percent
    expr: |
      100 * (
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      )

  # Disk space in use as a percentage of filesystem size.
  # No aggregation or filesystem filter: one output series per input series.
  - record: greyhaven:disk_usage:percent
    expr: |
      100 - (
        (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100
      )

  # Database connection pool saturation for YOUR_SERVICE: active connections
  # divided by the pool maximum (a ratio, not a percentage).
  - record: greyhaven:db_pool:saturation
    expr: |
      db_pool_connections_active{service="YOUR_SERVICE"}
      /
      db_pool_connections_max{service="YOUR_SERVICE"}

# Service level indicators for YOUR_SERVICE over several look-back windows.
# All values are ratios between 0 and 1. The sums carry no `by` clause, so
# the recorded series have no `service` label.
- name: sli_calculations
  interval: 30s
  rules:
  # Availability: requests answered with a 2xx or 3xx status divided by all
  # requests. Every other status counts against availability.

  # Availability over the last hour.
  - record: greyhaven:sli:availability:1h
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE", status=~"2..|3.."}[1h])
      )
      /
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[1h])
      )

  # Availability over the last 6 hours.
  - record: greyhaven:sli:availability:6h
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE", status=~"2..|3.."}[6h])
      )
      /
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[6h])
      )

  # Availability over the last 24 hours.
  - record: greyhaven:sli:availability:24h
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE", status=~"2..|3.."}[24h])
      )
      /
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[24h])
      )

  # Availability over the last 30 days.
  - record: greyhaven:sli:availability:30d
    expr: |
      sum(
        rate(http_requests_total{service="YOUR_SERVICE", status=~"2..|3.."}[30d])
      )
      /
      sum(
        rate(http_requests_total{service="YOUR_SERVICE"}[30d])
      )

  # Latency: requests counted in the le="0.2" histogram bucket divided by all
  # requests, i.e. the share served within the 200ms target.
  # NOTE(review): assumes the histogram exposes an le="0.2" bucket - confirm.

  # Latency SLI over the last hour.
  - record: greyhaven:sli:latency:1h
    expr: |
      sum(
        rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE", le="0.2"}[1h])
      )
      /
      sum(
        rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h])
      )

  # Latency SLI over the last 30 days.
  - record: greyhaven:sli:latency:30d
    expr: |
      sum(
        rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE", le="0.2"}[30d])
      )
      /
      sum(
        rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d])
      )

# Error budget tracking against a 99.9% availability SLO, built on the
# greyhaven:sli:availability:* recorded series.
#
# Those SLIs are recorded from a plain sum(...) with no `by` clause, so the
# recorded series have no `service` label. They are referenced here without
# a label matcher: a {service="YOUR_SERVICE"} selector would match nothing
# and none of these rules would ever produce a sample.
- name: error_budget
  interval: 30s
  rules:
  # Fraction of the 30-day error budget still available.
  # 1 = budget untouched, 0 = budget exhausted, negative = SLO violated.
  - record: greyhaven:error_budget:remaining:30d
    expr: |
      1 - (
        (1 - greyhaven:sli:availability:30d)
        /
        (1 - 0.999)
      )

  # Burn rate: observed error ratio divided by the ratio the SLO allows
  # (1 - 0.999). A value of 1 spends the budget exactly at the allowed pace.

  # Burn rate over the last hour.
  - record: greyhaven:error_budget:burn_rate:1h
    expr: |
      (1 - greyhaven:sli:availability:1h)
      /
      (1 - 0.999)

  # Burn rate over the last 6 hours.
  - record: greyhaven:error_budget:burn_rate:6h
    expr: |
      (1 - greyhaven:sli:availability:6h)
      /
      (1 - 0.999)

  # Burn rate over the last 24 hours.
  - record: greyhaven:error_budget:burn_rate:24h
    expr: |
      (1 - greyhaven:sli:availability:24h)
      /
      (1 - 0.999)

  # Error budget consumed, expressed as equivalent minutes of downtime:
  # 30-day unavailability ratio times 43200 (30 days * 24 h * 60 min).
  - record: greyhaven:error_budget:consumed:30d
    expr: |
      (1 - greyhaven:sli:availability:30d) * 43200
Reference in New Issue
Block a user