211 lines
6.6 KiB
JSON
211 lines
6.6 KiB
JSON
{
|
|
"dashboard": {
|
|
"title": "Golden Signals - [Service Name]",
|
|
"tags": ["golden-signals", "production", "slo"],
|
|
"timezone": "UTC",
|
|
"refresh": "30s",
|
|
"time": {
|
|
"from": "now-6h",
|
|
"to": "now"
|
|
},
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Request Rate (RPS)",
|
|
"type": "graph",
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))",
|
|
"legendFormat": "Total RPS",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m])) by (method)",
|
|
"legendFormat": "{{method}}",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "reqps", "label": "Requests/sec"},
|
|
{"format": "short"}
|
|
],
|
|
"legend": {"show": true, "alignAsTable": true, "avg": true, "max": true, "current": true}
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Error Rate (%)",
|
|
"type": "graph",
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
|
"targets": [
|
|
{
  "expr": "((sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"5..\"}[5m])) or vector(0)) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[5m]))) * 100",
  "legendFormat": "Error Rate %",
  "refId": "A"
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "percent", "label": "Error %", "max": 5},
|
|
{"format": "short"}
|
|
],
|
|
"alert": {
|
|
"name": "High Error Rate",
|
|
"conditions": [
|
|
{
  "evaluator": {"params": [1], "type": "gt"},
  "operator": {"type": "and"},
  "query": {"params": ["A", "5m", "now"]},
  "reducer": {"params": [], "type": "avg"},
  "type": "query"
}
|
|
],
|
|
"frequency": "1m",
|
|
"for": "5m",
|
|
"message": "Error rate > 1% for 5 minutes",
|
|
"noDataState": "no_data",
|
|
"notifications": []
|
|
},
|
|
"thresholds": [
|
|
{"value": 1, "colorMode": "critical", "op": "gt", "fill": true, "line": true}
|
|
]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Request Latency (p50/p95/p99)",
|
|
"type": "graph",
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
|
|
"legendFormat": "p50",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
|
|
"legendFormat": "p95",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le)) * 1000",
|
|
"legendFormat": "p99",
|
|
"refId": "C"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "ms", "label": "Latency (ms)"},
|
|
{"format": "short"}
|
|
],
|
|
"thresholds": [
|
|
{"value": 200, "colorMode": "warning", "op": "gt"},
|
|
{"value": 500, "colorMode": "critical", "op": "gt"}
|
|
]
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Resource Saturation (CPU/Memory %)",
|
|
"type": "graph",
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
},
|
|
{
  "expr": "100 * (1 - avg(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
  "legendFormat": "Memory %",
  "refId": "B"
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "percent", "label": "Usage %", "max": 100},
|
|
{"format": "short"}
|
|
],
|
|
"thresholds": [
|
|
{"value": 80, "colorMode": "warning", "op": "gt"},
|
|
{"value": 90, "colorMode": "critical", "op": "gt"}
|
|
]
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Top 10 Slowest Endpoints",
|
|
"type": "table",
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
|
"targets": [
|
|
{
|
|
"expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"YOUR_SERVICE\"}[5m])) by (le, path))) * 1000",
|
|
"legendFormat": "",
|
|
"format": "table",
|
|
"instant": true,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"transformations": [
|
|
{"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"path": "Endpoint", "Value": "p95 Latency (ms)"}}}
|
|
]
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "SLO Status (30-day)",
|
|
"type": "stat",
|
|
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
|
|
"targets": [
|
|
{
  "expr": "sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])) * 100",
  "instant": true,
  "refId": "A"
}
|
|
],
|
|
"options": {
|
|
"graphMode": "none",
|
|
"textMode": "value_and_name",
|
|
"colorMode": "background"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"decimals": 3,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"value": 0, "color": "red"},
|
|
{"value": 99.5, "color": "yellow"},
|
|
{"value": 99.9, "color": "green"}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"id": 7,
|
|
"title": "Error Budget Remaining",
|
|
"type": "gauge",
|
|
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
|
|
"targets": [
|
|
{
  "expr": "(1 - ((1 - (sum(rate(http_requests_total{service=\"YOUR_SERVICE\",status=~\"2..|3..\"}[30d])) / sum(rate(http_requests_total{service=\"YOUR_SERVICE\"}[30d])))) / (1 - 0.999))) * 100",
  "instant": true,
  "refId": "A"
}
|
|
],
|
|
"options": {
|
|
"showThresholdLabels": false,
|
|
"showThresholdMarkers": true
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"min": 0,
|
|
"max": 100,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"value": 0, "color": "red"},
|
|
{"value": 25, "color": "yellow"},
|
|
{"value": 50, "color": "green"}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|