Files
gh-greyhaven-ai-claude-code…/skills/observability-engineering/templates/slo-definition.yaml
2025-11-29 18:29:23 +08:00

174 lines
5.1 KiB
YAML

# SLO Definition Template
#
# Usage:
#   - Replace YOUR_SERVICE with the actual service name. It appears in
#     metadata.name, spec.service, every query selector and the dashboard UID.
#   - Replace 99.9 with your target SLO (99.5, 99.9, or 99.95).
---
apiVersion: monitoring.greyhaven.io/v1
kind: ServiceLevelObjective
metadata:
  name: YOUR_SERVICE-slo
  namespace: production

spec:
  # Service identification
  service: YOUR_SERVICE
  environment: production

  # SLO tier (critical, essential, standard)
  tier: essential

  # Time window (30 days recommended)
  window: 30d

  # SLO targets
  # Every objective uses a `ratio` indicator with a success query and a total
  # query; `target` is the required percentage of successes.
  # NOTE(review): `{{.window}}` is a template placeholder, presumably rendered
  # from `window` above - confirm with the tool that consumes this file.
  objectives:
    # Share of requests answered with a 2xx or 3xx status.
    - name: availability
      target: 99.9  # 99.9% = 43.2 min downtime/month
      indicator:
        type: ratio
        success_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[{{.window}}]))
        total_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))

    # Share of requests that land in the le="0.2" histogram bucket (<= 0.2 s).
    - name: latency
      target: 95  # 95% of requests < 200ms
      indicator:
        type: ratio
        success_query: |
          sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[{{.window}}]))
        total_query: |
          sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[{{.window}}]))

    # Share of requests whose status is anything other than 5xx.
    - name: error_rate
      target: 99.5  # <0.5% error rate
      indicator:
        type: ratio
        success_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status!~"5.."}[{{.window}}]))
        total_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))

  # Error budget policy
  # `budget_range` is the remaining error budget, from most to least healthy.
  errorBudget:
    policy:
      - budget_range: [75%, 100%]
        action: "Normal feature development"
        approval: "Engineering team"

      - budget_range: [50%, 75%]
        action: "Monitor closely, increase testing"
        approval: "Engineering team"

      - budget_range: [25%, 50%]
        action: "Prioritize reliability work, reduce risky changes"
        approval: "Engineering manager"

      - budget_range: [0%, 25%]
        action: "Feature freeze, all hands on reliability"
        approval: "VP Engineering"
        requirements:
          - "Daily reliability standup"
          - "Postmortem for all incidents"
          - "No new features until budget >50%"

      # Fully depleted budget. This range overlaps the lower bound of the
      # previous entry.
      # NOTE(review): confirm how the consumer resolves the overlap at 0%.
      - budget_range: [0%, 0%]
        action: "SLO violation - mandatory postmortem"
        approval: "VP Engineering + CTO"
        requirements:
          - "Complete postmortem within 48 hours"
          - "Action items with owners and deadlines"
          - "Present to exec team"

  # Multi-window burn rate alerts
  # Time to exhaustion = window / burn rate, using the 30d window above.
  alerts:
    - name: error-budget-burn-rate-critical
      severity: critical
      windows:
        short: 1h
        long: 6h
      # 30d / 14.4 = ~50h, i.e. about 2% of the budget is burned per hour.
      burn_rate_threshold: 14.4  # Budget exhausted in ~2 days
      for: 2m
      annotations:
        summary: "Critical burn rate - budget exhausted in ~2 days"
        description: "Service {{ $labels.service }} is burning error budget 14.4x faster than expected"
        runbook: "https://runbooks.greyhaven.io/slo-burn-rate"
      notifications:
        - type: pagerduty
          severity: critical

    - name: error-budget-burn-rate-high
      severity: warning
      windows:
        short: 6h
        long: 24h
      burn_rate_threshold: 6  # Budget exhausted in 5 days
      for: 15m
      annotations:
        summary: "High burn rate - budget exhausted in 5 days"
        description: "Service {{ $labels.service }} is burning error budget 6x faster than expected"
      notifications:
        - type: slack
          channel: "#alerts-reliability"

    - name: error-budget-burn-rate-medium
      severity: warning
      windows:
        short: 24h
        # Longer than the short window so the two windows are distinct.
        long: 72h
      burn_rate_threshold: 3  # Budget exhausted in 10 days
      for: 1h
      annotations:
        summary: "Medium burn rate - budget exhausted in 10 days"
        description: "Service {{ $labels.service }} is burning error budget 3x faster than expected"
      notifications:
        - type: slack
          channel: "#alerts-reliability"

    # Threshold alerts on the remaining budget (fraction of the total).
    - name: error-budget-low
      severity: warning
      threshold: 0.25  # 25% remaining
      for: 5m
      annotations:
        summary: "Error budget low ({{ $value | humanizePercentage }} remaining)"
        description: "Consider feature freeze per error budget policy"
      notifications:
        - type: slack
          channel: "#engineering-managers"

    - name: error-budget-depleted
      severity: critical
      threshold: 0  # 0% remaining
      for: 5m
      annotations:
        summary: "Error budget depleted - feature freeze required"
        description: "SLO violated. Postmortem required within 48 hours."
      notifications:
        - type: pagerduty
          severity: critical
        - type: slack
          channel: "#exec-alerts"

  # Review cadence
  review:
    frequency: weekly
    participants:
      - team: engineering
      - team: product
      - team: sre
    agenda:
      - "Current error budget status"
      - "Burn rate trends"
      - "Recent incidents and impact"
      - "Upcoming risky changes"

  # Reporting
  reporting:
    dashboard:
      grafana_uid: YOUR_SERVICE_slo_dashboard
      panels:
        - slo_status
        - error_budget_remaining
        - burn_rate_multiwindow
        - incident_timeline
    export:
      format: prometheus
      recording_rules: true