Files
gh-greyhaven-ai-claude-code…/skills/observability-engineering/templates/grafana-dashboard.json
2025-11-29 18:29:23 +08:00

211 lines
6.6 KiB
JSON

{
"dashboard": {
"title": "Golden Signals - [Service Name]",
"tags": ["golden-signals", "production", "slo"],
"timezone": "UTC",
"refresh": "30s",
"time": {
"from": "now-6h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Request Rate (RPS)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))",
"legendFormat": "Total RPS",
"refId": "A"
},
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m])) by (method)",
"legendFormat": "{{method}}",
"refId": "B"
}
],
"yaxes": [
{"format": "reqps", "label": "Requests/sec"},
{"format": "short"}
],
"legend": {"show": true, "alignAsTable": true, "avg": true, "max": true, "current": true}
},
{
"id": 2,
"title": "Error Rate (%)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"targets": [
{
"expr": "((sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"5..\"}[5m])) or vector(0)) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))) * 100",
"legendFormat": "Error Rate %",
"refId": "A"
}
],
"yaxes": [
{"format": "percent", "label": "Error %", "max": 5},
{"format": "short"}
],
"alert": {
"name": "High Error Rate",
"conditions": [
{
"evaluator": {"params": [1], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A", "5m", "now"]},
"reducer": {"params": [], "type": "avg"},
"type": "query"
}
],
"frequency": "1m",
"for": "5m",
"message": "Error rate > 1% for 5 minutes",
"noDataState": "no_data",
"notifications": []
},
"thresholds": [
{"value": 1, "colorMode": "critical", "op": "gt", "fill": true, "line": true}
]
},
{
"id": 3,
"title": "Request Latency (p50/p95/p99)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
"legendFormat": "p99",
"refId": "C"
}
],
"yaxes": [
{"format": "ms", "label": "Latency (ms)"},
{"format": "short"}
],
"thresholds": [
{"value": 200, "colorMode": "warning", "op": "gt"},
{"value": 500, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 4,
"title": "Resource Saturation (CPU/Memory %)",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
},
{
"expr": "100 * (1 - avg(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "Memory %",
"refId": "B"
}
],
"yaxes": [
{"format": "percent", "label": "Usage %", "max": 100},
{"format": "short"}
],
"thresholds": [
{"value": 80, "colorMode": "warning", "op": "gt"},
{"value": 90, "colorMode": "critical", "op": "gt"}
]
},
{
"id": 5,
"title": "Top 10 Slowest Endpoints",
"type": "table",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"targets": [
{
"expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le, path))) * 1000",
"legendFormat": "",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"path": "Endpoint", "Value": "p95 Latency (ms)"}}}
]
},
{
"id": 6,
"title": "SLO Status (30-day)",
"type": "stat",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
"targets": [
{
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])) * 100",
"refId": "A"
}
],
"options": {
"graphMode": "none",
"textMode": "value_and_name",
"colorMode": "background"
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"decimals": 3,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 99.5, "color": "yellow"},
{"value": 99.9, "color": "green"}
]
}
}
}
},
{
"id": 7,
"title": "Error Budget Remaining",
"type": "gauge",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
"targets": [
{
"expr": "(1 - ((1 - (sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])))) / (1 - 0.999))) * 100",
"refId": "A"
}
],
"options": {
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 25, "color": "yellow"},
{"value": 50, "color": "green"}
]
}
}
}
}
]
}
}