Initial commit

Zhongwei Li
2025-11-29 18:29:23 +08:00
commit ebc71f5387
37 changed files with 9382 additions and 0 deletions

@@ -0,0 +1,72 @@
# Observability Templates
Copy-paste ready configuration templates for Prometheus, Grafana, and OpenTelemetry.
## Templates Overview
### Grafana Dashboard Template
**File**: [grafana-dashboard.json](grafana-dashboard.json)
Production-ready Golden Signals dashboard:
- **Request Rate**: Total and per-method RPS over 5-minute rate windows
- **Error Rate**: Percentage of 5xx errors with alert thresholds
- **Latency**: p50/p95/p99 percentiles in milliseconds
- **Saturation**: CPU and memory usage percentages
**Use when**: Creating new service dashboards, standardizing monitoring
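**Sanity check** (a minimal sketch, assuming Prometheus is reachable at `localhost:9090` and your service carries the same `service` label the panels filter on):
```bash
# Confirm the metrics the dashboard panels query actually exist for your service
SERVICE="YOUR_SERVICE"                 # replace, as in the template
PROM="http://localhost:9090"           # assumed Prometheus address

# Request-rate and error-rate panels use http_requests_total
curl -sG "$PROM/api/v1/query" \
  --data-urlencode "query=sum(rate(http_requests_total{service=\"$SERVICE\"}[5m]))"

# Latency panels use http_request_duration_seconds_bucket
curl -sG "$PROM/api/v1/query" \
  --data-urlencode "query=histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"$SERVICE\"}[5m])) by (le))"
```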
---
### SLO Definition Template
**File**: [slo-definition.yaml](slo-definition.yaml)
Service Level Objective configuration:
- **SLO tiers**: Critical (99.95%), Essential (99.9%), Standard (99.5%)
- **SLI definitions**: Availability, latency, error rate
- **Error budget policy**: Feature freeze thresholds
- **Multi-window burn rate alerts**: 1h, 6h, 24h windows
**Use when**: Implementing SLO framework for new services
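**Budget math** (the tier targets translate directly into allowed downtime per 30-day window; plain shell arithmetic, independent of any SLO tooling):
```bash
# Allowed downtime per 30-day window (43,200 minutes) for each SLO tier
for target in 99.95 99.9 99.5; do
  awk -v t="$target" \
    'BEGIN { printf "%s%% target -> %.1f minutes of error budget per 30 days\n", t, (1 - t/100) * 43200 }'
done
# 99.95% -> 21.6 min, 99.9% -> 43.2 min, 99.5% -> 216.0 min
```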
---
### Prometheus Recording Rules
**File**: [prometheus-recording-rules.yaml](prometheus-recording-rules.yaml)
Pre-aggregated metrics for fast dashboards:
- **Request rates**: Per-service, per-endpoint RPS
- **Error rates**: 5xx-to-total request ratios
- **Latency percentiles**: p50/p95/p99 pre-computed
- **Error budget**: Remaining budget and burn rate
**Use when**: Optimizing slow dashboard queries, implementing SLOs
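**Sanity check** (a sketch, assuming `promtool` is on your PATH and Prometheus serves its HTTP API at `localhost:9090`):
```bash
# Lint the rule file (promtool ships with Prometheus)
promtool check rules templates/prometheus-recording-rules.yaml

# After Prometheus has evaluated the group at least once, spot-check a recorded series
curl -sG http://localhost:9090/api/v1/query \
  --data-urlencode 'query=greyhaven:http_requests:rate5m'
```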
---
## Quick Usage
```bash
# Copy template to your monitoring directory
cp templates/grafana-dashboard.json ../monitoring/dashboards/
# Edit service name and thresholds
vim ../monitoring/dashboards/grafana-dashboard.json
# Import to Grafana
curl -X POST http://admin:password@localhost:3000/api/dashboards/db \
-H "Content-Type: application/json" \
-d @../monitoring/dashboards/grafana-dashboard.json
```
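Grafana only needs the dashboard JSON; the recording rules and SLO definition are consumed by Prometheus (or whatever controller handles the `ServiceLevelObjective` resource). A sketch of loading the recording rules, assuming a conventional `/etc/prometheus` layout and a Prometheus started with `--web.enable-lifecycle` so it accepts HTTP reloads:
```bash
# Place the rules where prometheus.yml references them, e.g.
#   rule_files:
#     - "rules/*.yaml"
cp templates/prometheus-recording-rules.yaml /etc/prometheus/rules/

# Validate the combined configuration, then hot-reload Prometheus
promtool check config /etc/prometheus/prometheus.yml
curl -X POST http://localhost:9090/-/reload
```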
## Related Documentation
- **Examples**: [Examples Index](../examples/INDEX.md) - Full implementations
- **Reference**: [Reference Index](../reference/INDEX.md) - PromQL, SLO guides
- **Main Agent**: [observability-engineer.md](../observability-engineer.md) - Observability agent
---
Return to [main agent](../observability-engineer.md)

@@ -0,0 +1,210 @@
{
"dashboard": {
"title": "Golden Signals - [Service Name]",
"tags": ["golden-signals", "production", "slo"],
"timezone": "UTC",
"refresh": "30s",
"time": {
"from": "now-6h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Request Rate (RPS)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))",
"legendFormat": "Total RPS",
"refId": "A"
},
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m])) by (method)",
"legendFormat": "{{method}}",
"refId": "B"
}
],
"yaxes": [
{"format": "reqps", "label": "Requests/sec"},
{"format": "short"}
],
"legend": {"show": true, "alignAsTable": true, "avg": true, "max": true, "current": true}
},
{
"id": 2,
"title": "Error Rate (%)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"targets": [
{
"expr": "(sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))) * 100",
"legendFormat": "Error Rate %",
"refId": "A"
}
],
"yaxes": [
{"format": "percent", "label": "Error %", "max": 5},
{"format": "short"}
],
"alert": {
"name": "High Error Rate",
"conditions": [
{
"evaluator": {"params": [1], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A", "5m", "now"]},
"type": "query"
}
],
"frequency": "1m",
"for": "5m",
"message": "Error rate > 1% for 5 minutes",
"noDataState": "no_data",
"notifications": []
},
"thresholds": [
{"value": 1, "colorMode": "critical", "op": "gt", "fill": true, "line": true}
]
},
{
"id": 3,
"title": "Request Latency (p50/p95/p99)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p99",
"refId": "C"
}
],
"yaxes": [
{"format": "ms", "label": "Latency (ms)"},
{"format": "short"}
],
"thresholds": [
{"value": 200, "colorMode": "warning", "op": "gt"},
{"value": 500, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 4,
"title": "Resource Saturation (CPU/Memory %)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
},
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "Memory %",
"refId": "B"
}
],
"yaxes": [
{"format": "percent", "label": "Usage %", "max": 100},
{"format": "short"}
],
"thresholds": [
{"value": 80, "colorMode": "warning", "op": "gt"},
{"value": 90, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 5,
"title": "Top 10 Slowest Endpoints",
"type": "table",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"targets": [
{
"expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le, path))) * 1000",
"legendFormat": "",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"path": "Endpoint", "Value": "p95 Latency (ms)"}}}
]
},
{
"id": 6,
"title": "SLO Status (30-day)",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])) * 100",
"refId": "A"
}
],
"options": {
"graphMode": "none",
"textMode": "value_and_name",
"colorMode": "background"
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"decimals": 3,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 99.5, "color": "yellow"},
{"value": 99.9, "color": "green"}
]
}
}
}
},
{
"id": 7,
"title": "Error Budget Remaining",
"type": "gauge",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
"targets": [
{
"expr": "(1 - ((1 - (sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])))) / (1 - 0.999))) * 100",
"refId": "A"
}
],
"options": {
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 25, "color": "yellow"},
{"value": 50, "color": "green"}
]
}
}
}
}
]
}
}

@@ -0,0 +1,188 @@
# Prometheus Recording Rules Template
# Pre-aggregated metrics for fast dashboard queries and SLO tracking
# Replace YOUR_SERVICE with actual service name
groups:
# HTTP Request Rates
- name: http_request_rates
interval: 15s
rules:
# Total request rate (per-second)
- record: greyhaven:http_requests:rate5m
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
# Request rate by service
- record: greyhaven:http_requests:rate5m:by_service
expr: sum(rate(http_requests_total[5m])) by (service)
# Request rate by endpoint
- record: greyhaven:http_requests:rate5m:by_endpoint
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# Request rate by method
- record: greyhaven:http_requests:rate5m:by_method
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (method)
# Request rate by status code
- record: greyhaven:http_requests:rate5m:by_status
expr: sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (status)
# HTTP Error Rates
- name: http_error_rates
interval: 15s
rules:
# Error rate (percentage)
- record: greyhaven:http_errors:rate5m
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m]))
# Error rate by service
- record: greyhaven:http_errors:rate5m:by_service
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
/
sum(rate(http_requests_total[5m])) by (service)
# Error rate by endpoint
- record: greyhaven:http_errors:rate5m:by_endpoint
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"5.."}[5m])) by (endpoint)
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[5m])) by (endpoint)
# HTTP Latency (Duration)
- name: http_latency
interval: 15s
rules:
# p50 latency (median)
- record: greyhaven:http_latency:p50
expr: histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# p95 latency
- record: greyhaven:http_latency:p95
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# p99 latency
- record: greyhaven:http_latency:p99
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le))
# Average latency
- record: greyhaven:http_latency:avg
expr: |
sum(rate(http_request_duration_seconds_sum{service="YOUR_SERVICE"}[5m]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[5m]))
# p95 latency by endpoint
- record: greyhaven:http_latency:p95:by_endpoint
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE"}[5m])) by (le, endpoint))
# Resource Saturation
- name: resource_saturation
interval: 15s
rules:
# CPU usage percentage
- record: greyhaven:cpu_usage:percent
expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage percentage
- record: greyhaven:memory_usage:percent
expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
# Disk usage percentage
- record: greyhaven:disk_usage:percent
expr: 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
# Database connection pool saturation
- record: greyhaven:db_pool:saturation
expr: |
db_pool_connections_active{service="YOUR_SERVICE"}
/
db_pool_connections_max{service="YOUR_SERVICE"}
# SLI Calculations (Multi-Window)
- name: sli_calculations
interval: 30s
rules:
# Availability SLI - 1 hour window
- record: greyhaven:sli:availability:1h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[1h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[1h]))
# Availability SLI - 6 hour window
- record: greyhaven:sli:availability:6h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[6h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[6h]))
# Availability SLI - 24 hour window
- record: greyhaven:sli:availability:24h
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[24h]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[24h]))
# Availability SLI - 30 day window
- record: greyhaven:sli:availability:30d
expr: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[30d]))
/
sum(rate(http_requests_total{service="YOUR_SERVICE"}[30d]))
# Latency SLI - 1 hour window (% requests < 200ms)
- record: greyhaven:sli:latency:1h
expr: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[1h]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[1h]))
# Latency SLI - 30 day window
- record: greyhaven:sli:latency:30d
expr: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[30d]))
/
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[30d]))
# Error Budget Tracking
- name: error_budget
interval: 30s
rules:
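# The SLI series above are already scoped to YOUR_SERVICE (sum() drops the service label),
# so they are referenced below without a label matcher.
# Remaining budget = 1 - (observed error rate / allowed error rate).
# Example: 99.95% measured availability against a 99.9% target -> 1 - (0.0005 / 0.001) = 0.5 (50% left).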
# Error budget remaining (for 99.9% SLO)
- record: greyhaven:error_budget:remaining:30d
expr: |
1 - (
(1 - greyhaven:sli:availability:30d)
/
(1 - 0.999)
)
# Error budget burn rate - 1 hour window
- record: greyhaven:error_budget:burn_rate:1h
expr: |
(1 - greyhaven:sli:availability:1h)
/
(1 - 0.999)
# Error budget burn rate - 6 hour window
- record: greyhaven:error_budget:burn_rate:6h
expr: |
(1 - greyhaven:sli:availability:6h)
/
(1 - 0.999)
# Error budget burn rate - 24 hour window
- record: greyhaven:error_budget:burn_rate:24h
expr: |
(1 - greyhaven:sli:availability:24h)
/
(1 - 0.999)
# Error budget consumed (minutes of downtime)
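# 30 days = 43,200 minutes, so multiplying the unavailability fraction by 43200 gives downtime-equivalent minutes spent.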
- record: greyhaven:error_budget:consumed:30d
expr: |
(1 - greyhaven:sli:availability:30d) * 43200

@@ -0,0 +1,173 @@
# SLO Definition Template
# Replace YOUR_SERVICE with actual service name
# Replace 99.9 with your target SLO (99.5, 99.9, or 99.95)
apiVersion: monitoring.greyhaven.io/v1
kind: ServiceLevelObjective
metadata:
name: YOUR_SERVICE-slo
namespace: production
spec:
# Service identification
service: YOUR_SERVICE
environment: production
# SLO tier (critical, essential, standard)
tier: essential
# Time window (30 days recommended)
window: 30d
# SLO targets
objectives:
- name: availability
target: 99.9 # 99.9% = 43.2 min downtime/month
indicator:
type: ratio
success_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status=~"2..|3.."}[{{.window}}]))
total_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))
- name: latency
target: 95 # 95% of requests < 200ms
indicator:
type: ratio
success_query: |
sum(rate(http_request_duration_seconds_bucket{service="YOUR_SERVICE",le="0.2"}[{{.window}}]))
total_query: |
sum(rate(http_request_duration_seconds_count{service="YOUR_SERVICE"}[{{.window}}]))
- name: error_rate
target: 99.5 # <0.5% error rate
indicator:
type: ratio
success_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE",status!~"5.."}[{{.window}}]))
total_query: |
sum(rate(http_requests_total{service="YOUR_SERVICE"}[{{.window}}]))
# Error budget policy
errorBudget:
policy:
- budget_range: [75%, 100%]
action: "Normal feature development"
approval: "Engineering team"
- budget_range: [50%, 75%]
action: "Monitor closely, increase testing"
approval: "Engineering team"
- budget_range: [25%, 50%]
action: "Prioritize reliability work, reduce risky changes"
approval: "Engineering manager"
- budget_range: [0%, 25%]
action: "Feature freeze, all hands on reliability"
approval: "VP Engineering"
requirements:
- "Daily reliability standup"
- "Postmortem for all incidents"
- "No new features until budget >50%"
- budget_range: [0%, 0%]
action: "SLO violation - mandatory postmortem"
approval: "VP Engineering + CTO"
requirements:
- "Complete postmortem within 48 hours"
- "Action items with owners and deadlines"
- "Present to exec team"
# Multi-window burn rate alerts
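# burn_rate_threshold is the ratio of the observed error rate to the error rate the SLO allows;
# sustained at N, a 30-day error budget is exhausted in 30/N days (14.4 -> ~2 days, 6 -> 5 days, 3 -> 10 days).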
alerts:
- name: error-budget-burn-rate-critical
severity: critical
windows:
short: 1h
long: 6h
burn_rate_threshold: 14.4 # At this rate the 30-day budget is exhausted in ~2 days (30 / 14.4)
for: 2m
annotations:
summary: "Critical burn rate - budget exhausted in 2 hours"
description: "Service {{ $labels.service }} is burning error budget 14.4x faster than expected"
runbook: "https://runbooks.greyhaven.io/slo-burn-rate"
notifications:
- type: pagerduty
severity: critical
- name: error-budget-burn-rate-high
severity: warning
windows:
short: 6h
long: 24h
burn_rate_threshold: 6 # Budget exhausted in 5 days
for: 15m
annotations:
summary: "High burn rate - budget exhausted in 5 days"
description: "Service {{ $labels.service }} is burning error budget 6x faster than expected"
notifications:
- type: slack
channel: "#alerts-reliability"
- name: error-budget-burn-rate-medium
severity: warning
windows:
short: 24h
long: 24h
burn_rate_threshold: 3 # Budget exhausted in 10 days
for: 1h
annotations:
summary: "Medium burn rate - budget exhausted in 10 days"
notifications:
- type: slack
channel: "#alerts-reliability"
- name: error-budget-low
severity: warning
threshold: 0.25 # 25% remaining
for: 5m
annotations:
summary: "Error budget low ({{ $value | humanizePercentage }} remaining)"
description: "Consider feature freeze per error budget policy"
notifications:
- type: slack
channel: "#engineering-managers"
- name: error-budget-depleted
severity: critical
threshold: 0 # 0% remaining
for: 5m
annotations:
summary: "Error budget depleted - feature freeze required"
description: "SLO violated. Postmortem required within 48 hours."
notifications:
- type: pagerduty
severity: critical
- type: slack
channel: "#exec-alerts"
# Review cadence
review:
frequency: weekly
participants:
- team: engineering
- team: product
- team: sre
agenda:
- "Current error budget status"
- "Burn rate trends"
- "Recent incidents and impact"
- "Upcoming risky changes"
# Reporting
reporting:
dashboard:
grafana_uid: YOUR_SERVICE_slo_dashboard
panels:
- slo_status
- error_budget_remaining
- burn_rate_multiwindow
- incident_timeline
export:
format: prometheus
recording_rules: true