# SLO Definition Template
#
# Usage:
#   - Replace YOUR_SERVICE with the actual service name (it appears in
#     metadata, spec.service, every query selector, and the dashboard UID).
#   - Replace 99.9 with your target SLO (99.5, 99.9, or 99.95).
#
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file; confirm the placement of keys under `spec` against the
# ServiceLevelObjective schema before applying.
---
apiVersion: monitoring.greyhaven.io/v1
kind: ServiceLevelObjective
metadata:
  name: YOUR_SERVICE-slo
  namespace: production

spec:
  # Service identification
  service: YOUR_SERVICE
  environment: production

  # SLO tier (critical, essential, standard)
  tier: essential

  # Time window (30 days recommended)
  window: 30d

  # SLO targets
  # Each objective is a ratio indicator defined by a success query and a
  # total query; `target` is the required percentage.
  objectives:
    - name: availability
      target: 99.9  # 99.9% = 43.2 min downtime/month
      indicator:
        type: ratio
        # Only 2xx and 3xx responses count as successful.
        success_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[{{.window}}]))
        total_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))

    - name: latency
      target: 95  # 95% of requests < 200ms
      indicator:
        type: ratio
        # le="0.2" selects the histogram bucket for requests <= 0.2 seconds.
        success_query: |
          sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[{{.window}}]))
        total_query: |
          sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[{{.window}}]))

    - name: error_rate
      target: 99.5  # <0.5% error rate
      indicator:
        type: ratio
        # Everything except 5xx counts as successful (4xx is not an error here).
        success_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE",status!~"5.."}[{{.window}}]))
        total_query: |
          sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))

  # Error budget policy
  # budget_range is the remaining error budget, listed from healthiest to
  # depleted. Values are quoted so they are always read as strings.
  errorBudget:
    policy:
      - budget_range: ["75%", "100%"]
        action: "Normal feature development"
        approval: "Engineering team"

      - budget_range: ["50%", "75%"]
        action: "Monitor closely, increase testing"
        approval: "Engineering team"

      - budget_range: ["25%", "50%"]
        action: "Prioritize reliability work, reduce risky changes"
        approval: "Engineering manager"

      - budget_range: ["0%", "25%"]
        action: "Feature freeze, all hands on reliability"
        approval: "VP Engineering"
        requirements:
          - "Daily reliability standup"
          - "Postmortem for all incidents"
          - "No new features until budget >50%"

      - budget_range: ["0%", "0%"]
        action: "SLO violation - mandatory postmortem"
        approval: "VP Engineering + CTO"
        requirements:
          - "Complete postmortem within 48 hours"
          - "Action items with owners and deadlines"
          - "Present to exec team"

  # Multi-window burn rate alerts
  # Time to exhaustion = SLO window / burn rate (30d window assumed below).
  alerts:
    - name: error-budget-burn-rate-critical
      severity: critical
      windows:
        short: 1h
        long: 6h
      burn_rate_threshold: 14.4  # Budget exhausted in ~2 days (30d / 14.4)
      for: 2m
      annotations:
        summary: "Critical burn rate - budget exhausted in 2 days"
        description: "Service {{ $labels.service }} is burning error budget 14.4x faster than expected"
        runbook: "https://runbooks.greyhaven.io/slo-burn-rate"
      notifications:
        - type: pagerduty
          severity: critical

    - name: error-budget-burn-rate-high
      severity: warning
      windows:
        short: 6h
        long: 24h
      burn_rate_threshold: 6  # Budget exhausted in 5 days
      for: 15m
      annotations:
        summary: "High burn rate - budget exhausted in 5 days"
        description: "Service {{ $labels.service }} is burning error budget 6x faster than expected"
      notifications:
        - type: slack
          channel: "#alerts-reliability"

    - name: error-budget-burn-rate-medium
      severity: warning
      windows:
        short: 24h
        # The long window must be longer than the short one for the
        # multi-window check to add anything; 72h continues the
        # 1h/6h -> 6h/24h -> 24h/72h progression of the tiers above.
        long: 72h
      burn_rate_threshold: 3  # Budget exhausted in 10 days
      for: 1h
      annotations:
        summary: "Medium burn rate - budget exhausted in 10 days"
        description: "Service {{ $labels.service }} is burning error budget 3x faster than expected"
      notifications:
        - type: slack
          channel: "#alerts-reliability"

    - name: error-budget-low
      severity: warning
      threshold: 0.25  # 25% remaining
      for: 5m
      annotations:
        summary: "Error budget low ({{ $value | humanizePercentage }} remaining)"
        description: "Consider feature freeze per error budget policy"
      notifications:
        - type: slack
          channel: "#engineering-managers"

    - name: error-budget-depleted
      severity: critical
      threshold: 0  # 0% remaining
      for: 5m
      annotations:
        summary: "Error budget depleted - feature freeze required"
        description: "SLO violated. Postmortem required within 48 hours."
      notifications:
        - type: pagerduty
          severity: critical
        - type: slack
          channel: "#exec-alerts"

  # Review cadence
  review:
    frequency: weekly
    participants:
      - team: engineering
      - team: product
      - team: sre
    agenda:
      - "Current error budget status"
      - "Burn rate trends"
      - "Recent incidents and impact"
      - "Upcoming risky changes"

  # Reporting
  reporting:
    dashboard:
      grafana_uid: YOUR_SERVICE_slo_dashboard
      panels:
        - slo_status
        - error_budget_remaining
        - burn_rate_multiwindow
        - incident_timeline
    export:
      format: prometheus
      recording_rules: true