Initial commit
This commit is contained in:
438
skills/distributed-tracing/SKILL.md
Normal file
438
skills/distributed-tracing/SKILL.md
Normal file
@@ -0,0 +1,438 @@
|
||||
---
|
||||
name: distributed-tracing
|
||||
description: Implement distributed tracing with Jaeger and Tempo to track requests across microservices and identify performance bottlenecks. Use when debugging microservices, analyzing request flows, or implementing observability for distributed systems.
|
||||
---
|
||||
|
||||
# Distributed Tracing
|
||||
|
||||
Implement distributed tracing with Jaeger and Tempo for request flow visibility across microservices.
|
||||
|
||||
## Purpose
|
||||
|
||||
Track requests across distributed systems to understand latency, dependencies, and failure points.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Debug latency issues
|
||||
- Understand service dependencies
|
||||
- Identify bottlenecks
|
||||
- Trace error propagation
|
||||
- Analyze request paths
|
||||
|
||||
## Distributed Tracing Concepts
|
||||
|
||||
### Trace Structure
|
||||
```
|
||||
Trace (Request ID: abc123)
|
||||
↓
|
||||
Span (frontend) [100ms]
|
||||
↓
|
||||
Span (api-gateway) [80ms]
|
||||
├→ Span (auth-service) [10ms]
|
||||
└→ Span (user-service) [60ms]
|
||||
└→ Span (database) [40ms]
|
||||
```
|
||||
|
||||
### Key Components
|
||||
- **Trace** - End-to-end request journey
|
||||
- **Span** - Single operation within a trace
|
||||
- **Context** - Metadata propagated between services
|
||||
- **Tags** - Key-value pairs for filtering
|
||||
- **Logs** - Timestamped events within a span
|
||||
|
||||
## Jaeger Setup
|
||||
|
||||
### Kubernetes Deployment
|
||||
|
||||
```bash
|
||||
# Deploy Jaeger Operator
|
||||
kubectl create namespace observability
|
||||
kubectl create -f https://github.com/jaegertracing/jaeger-operator/releases/download/v1.51.0/jaeger-operator.yaml -n observability
|
||||
|
||||
# Deploy Jaeger instance
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: jaegertracing.io/v1
|
||||
kind: Jaeger
|
||||
metadata:
|
||||
name: jaeger
|
||||
namespace: observability
|
||||
spec:
|
||||
strategy: production
|
||||
storage:
|
||||
type: elasticsearch
|
||||
options:
|
||||
es:
|
||||
server-urls: http://elasticsearch:9200
|
||||
ingress:
|
||||
enabled: true
|
||||
EOF
|
||||
```
|
||||
|
||||
### Docker Compose
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
ports:
|
||||
- "5775:5775/udp"
|
||||
- "6831:6831/udp"
|
||||
- "6832:6832/udp"
|
||||
- "5778:5778"
|
||||
- "16686:16686" # UI
|
||||
- "14268:14268" # Collector
|
||||
- "14250:14250" # gRPC
|
||||
- "9411:9411" # Zipkin
|
||||
environment:
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
```
|
||||
|
||||
**Reference:** See `references/jaeger-setup.md`
|
||||
|
||||
## Application Instrumentation
|
||||
|
||||
### OpenTelemetry (Recommended)
|
||||
|
||||
#### Python (Flask)
|
||||
```python
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
|
||||
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry.instrumentation.flask import FlaskInstrumentor
|
||||
from flask import Flask
|
||||
|
||||
# Initialize tracer
|
||||
resource = Resource(attributes={SERVICE_NAME: "my-service"})
|
||||
provider = TracerProvider(resource=resource)
|
||||
processor = BatchSpanProcessor(JaegerExporter(
|
||||
agent_host_name="jaeger",
|
||||
agent_port=6831,
|
||||
))
|
||||
provider.add_span_processor(processor)
|
||||
trace.set_tracer_provider(provider)
|
||||
|
||||
# Instrument Flask
|
||||
app = Flask(__name__)
|
||||
FlaskInstrumentor().instrument_app(app)
|
||||
|
||||
@app.route('/api/users')
|
||||
def get_users():
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
with tracer.start_as_current_span("get_users") as span:
|
||||
span.set_attribute("user.count", 100)
|
||||
# Business logic
|
||||
users = fetch_users_from_db()
|
||||
return {"users": users}
|
||||
|
||||
def fetch_users_from_db():
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
with tracer.start_as_current_span("database_query") as span:
|
||||
span.set_attribute("db.system", "postgresql")
|
||||
span.set_attribute("db.statement", "SELECT * FROM users")
|
||||
# Database query
|
||||
return query_database()
|
||||
```
|
||||
|
||||
#### Node.js (Express)
|
||||
```javascript
|
||||
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
// `trace` comes from the API package — it was used below without being imported.
const { trace } = require('@opentelemetry/api');
// The provider expects a Resource instance, not a plain object literal.
const { Resource } = require('@opentelemetry/resources');

// Initialize tracer
const provider = new NodeTracerProvider({
  resource: new Resource({ 'service.name': 'my-service' }),
});

const exporter = new JaegerExporter({
  endpoint: 'http://jaeger:14268/api/traces',
});

provider.addSpanProcessor(new BatchSpanProcessor(exporter));
provider.register();

// Instrument libraries
registerInstrumentations({
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
  ],
});

const express = require('express');
const app = express();

app.get('/api/users', async (req, res) => {
  const tracer = trace.getTracer('my-service');
  const span = tracer.startSpan('get_users');

  try {
    const users = await fetchUsers();
    span.setAttributes({ 'user.count': users.length });
    res.json({ users });
  } catch (err) {
    // Record the failure on the span and still answer the client;
    // without this catch the error was lost and the request hung.
    span.recordException(err);
    res.status(500).json({ error: 'internal error' });
  } finally {
    span.end();
  }
});
|
||||
```
|
||||
|
||||
#### Go
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/exporters/jaeger"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
|
||||
)
|
||||
|
||||
func initTracer() (*sdktrace.TracerProvider, error) {
|
||||
exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(
|
||||
jaeger.WithEndpoint("http://jaeger:14268/api/traces"),
|
||||
))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tp := sdktrace.NewTracerProvider(
|
||||
sdktrace.WithBatcher(exporter),
|
||||
sdktrace.WithResource(resource.NewWithAttributes(
|
||||
semconv.SchemaURL,
|
||||
semconv.ServiceNameKey.String("my-service"),
|
||||
)),
|
||||
)
|
||||
|
||||
otel.SetTracerProvider(tp)
|
||||
return tp, nil
|
||||
}
|
||||
|
||||
func getUsers(ctx context.Context) ([]User, error) {
|
||||
tracer := otel.Tracer("my-service")
|
||||
ctx, span := tracer.Start(ctx, "get_users")
|
||||
defer span.End()
|
||||
|
||||
span.SetAttributes(attribute.String("user.filter", "active"))
|
||||
|
||||
users, err := fetchUsersFromDB(ctx)
|
||||
if err != nil {
|
||||
span.RecordError(err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
span.SetAttributes(attribute.Int("user.count", len(users)))
|
||||
return users, nil
|
||||
}
|
||||
```
|
||||
|
||||
**Reference:** See `references/instrumentation.md`
|
||||
|
||||
## Context Propagation
|
||||
|
||||
### HTTP Headers
|
||||
```
|
||||
traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
|
||||
tracestate: congo=t61rcWkgMzE
|
||||
```
|
||||
|
||||
### Propagation in HTTP Requests
|
||||
|
||||
#### Python
|
||||
```python
|
||||
import requests  # was used below without being imported

from opentelemetry.propagate import inject

headers = {}
inject(headers)  # Injects the current trace context (traceparent/tracestate) into headers

response = requests.get('http://downstream-service/api', headers=headers)
|
||||
```
|
||||
|
||||
#### Node.js
|
||||
```javascript
|
||||
// `context` is needed for context.active(), and axios must be required;
// both were used below without being brought into scope.
const { propagation, context } = require('@opentelemetry/api');
const axios = require('axios');

const headers = {};
propagation.inject(context.active(), headers); // copies traceparent/tracestate into headers

axios.get('http://downstream-service/api', { headers });
|
||||
```
|
||||
|
||||
## Tempo Setup (Grafana)
|
||||
|
||||
### Kubernetes Deployment
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: tempo-config
|
||||
data:
|
||||
tempo.yaml: |
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
jaeger:
|
||||
protocols:
|
||||
thrift_http:
|
||||
grpc:
|
||||
otlp:
|
||||
protocols:
|
||||
http:
|
||||
grpc:
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: s3
|
||||
s3:
|
||||
bucket: tempo-traces
|
||||
endpoint: s3.amazonaws.com
|
||||
|
||||
querier:
|
||||
frontend_worker:
|
||||
frontend_address: tempo-query-frontend:9095
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: tempo
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: tempo
|
||||
image: grafana/tempo:latest
|
||||
args:
|
||||
- -config.file=/etc/tempo/tempo.yaml
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/tempo
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: tempo-config
|
||||
```
|
||||
|
||||
**Reference:** See `assets/jaeger-config.yaml.template`
|
||||
|
||||
## Sampling Strategies
|
||||
|
||||
### Probabilistic Sampling
|
||||
```yaml
|
||||
# Sample 1% of traces
|
||||
sampler:
|
||||
type: probabilistic
|
||||
param: 0.01
|
||||
```
|
||||
|
||||
### Rate Limiting Sampling
|
||||
```yaml
|
||||
# Sample max 100 traces per second
|
||||
sampler:
|
||||
type: ratelimiting
|
||||
param: 100
|
||||
```
|
||||
|
||||
### Adaptive Sampling
|
||||
```python
|
||||
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
|
||||
|
||||
# Sample based on trace ID (deterministic)
|
||||
sampler = ParentBased(root=TraceIdRatioBased(0.01))
|
||||
```
|
||||
|
||||
## Trace Analysis
|
||||
|
||||
### Finding Slow Requests
|
||||
|
||||
**Jaeger Query:**
|
||||
```
|
||||
service=my-service
|
||||
duration > 1s
|
||||
```
|
||||
|
||||
### Finding Errors
|
||||
|
||||
**Jaeger Query:**
|
||||
```
|
||||
service=my-service
|
||||
error=true
|
||||
tags.http.status_code >= 500
|
||||
```
|
||||
|
||||
### Service Dependency Graph
|
||||
|
||||
Jaeger automatically generates service dependency graphs showing:
|
||||
- Service relationships
|
||||
- Request rates
|
||||
- Error rates
|
||||
- Average latencies
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Sample appropriately** (1-10% in production)
|
||||
2. **Add meaningful tags** (user_id, request_id)
|
||||
3. **Propagate context** across all service boundaries
|
||||
4. **Log exceptions** in spans
|
||||
5. **Use consistent naming** for operations
|
||||
6. **Monitor tracing overhead** (<1% CPU impact)
|
||||
7. **Set up alerts** for trace errors
|
||||
8. **Implement distributed context** (baggage)
|
||||
9. **Use span events** for important milestones
|
||||
10. **Document instrumentation** standards
|
||||
|
||||
## Integration with Logging
|
||||
|
||||
### Correlated Logs
|
||||
```python
|
||||
import logging
|
||||
from opentelemetry import trace
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def process_request():
|
||||
span = trace.get_current_span()
|
||||
trace_id = span.get_span_context().trace_id
|
||||
|
||||
logger.info(
|
||||
"Processing request",
|
||||
extra={"trace_id": format(trace_id, '032x')}
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No traces appearing:**
|
||||
- Check collector endpoint
|
||||
- Verify network connectivity
|
||||
- Check sampling configuration
|
||||
- Review application logs
|
||||
|
||||
**High latency overhead:**
|
||||
- Reduce sampling rate
|
||||
- Use batch span processor
|
||||
- Check exporter configuration
|
||||
|
||||
## Reference Files
|
||||
|
||||
- `references/jaeger-setup.md` - Jaeger installation
|
||||
- `references/instrumentation.md` - Instrumentation patterns
|
||||
- `assets/jaeger-config.yaml.template` - Jaeger configuration
|
||||
|
||||
## Related Skills
|
||||
|
||||
- `prometheus-configuration` - For metrics
|
||||
- `grafana-dashboards` - For visualization
|
||||
- `slo-implementation` - For latency SLOs
|
||||
369
skills/grafana-dashboards/SKILL.md
Normal file
369
skills/grafana-dashboards/SKILL.md
Normal file
@@ -0,0 +1,369 @@
|
||||
---
|
||||
name: grafana-dashboards
|
||||
description: Create and manage production Grafana dashboards for real-time visualization of system and application metrics. Use when building monitoring dashboards, visualizing metrics, or creating operational observability interfaces.
|
||||
---
|
||||
|
||||
# Grafana Dashboards
|
||||
|
||||
Create and manage production-ready Grafana dashboards for comprehensive system observability.
|
||||
|
||||
## Purpose
|
||||
|
||||
Design effective Grafana dashboards for monitoring applications, infrastructure, and business metrics.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Visualize Prometheus metrics
|
||||
- Create custom dashboards
|
||||
- Implement SLO dashboards
|
||||
- Monitor infrastructure
|
||||
- Track business KPIs
|
||||
|
||||
## Dashboard Design Principles
|
||||
|
||||
### 1. Hierarchy of Information
|
||||
```
|
||||
┌─────────────────────────────────────┐
|
||||
│ Critical Metrics (Big Numbers) │
|
||||
├─────────────────────────────────────┤
|
||||
│ Key Trends (Time Series) │
|
||||
├─────────────────────────────────────┤
|
||||
│ Detailed Metrics (Tables/Heatmaps) │
|
||||
└─────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 2. RED Method (Services)
|
||||
- **Rate** - Requests per second
|
||||
- **Errors** - Error rate
|
||||
- **Duration** - Latency/response time
|
||||
|
||||
### 3. USE Method (Resources)
|
||||
- **Utilization** - % time resource is busy
|
||||
- **Saturation** - Queue length/wait time
|
||||
- **Errors** - Error count
|
||||
|
||||
## Dashboard Structure
|
||||
|
||||
### API Monitoring Dashboard
|
||||
|
||||
```json
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "API Monitoring",
|
||||
"tags": ["api", "production"],
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total[5m])) by (service)",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}
|
||||
},
|
||||
{
|
||||
"title": "Error Rate %",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))) * 100",
|
||||
"legendFormat": "Error Rate"
|
||||
}
|
||||
],
|
||||
"alert": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {"params": [5], "type": "gt"},
|
||||
"operator": {"type": "and"},
|
||||
"query": {"params": ["A", "5m", "now"]},
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}
|
||||
},
|
||||
{
|
||||
"title": "P95 Latency",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Reference:** See `assets/api-dashboard.json`
|
||||
|
||||
## Panel Types
|
||||
|
||||
### 1. Stat Panel (Single Value)
|
||||
```json
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Total Requests",
|
||||
"targets": [{
|
||||
"expr": "sum(http_requests_total)"
|
||||
}],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["lastNotNull"]
|
||||
},
|
||||
"orientation": "auto",
|
||||
"textMode": "auto",
|
||||
"colorMode": "value"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 80, "color": "yellow"},
|
||||
{"value": 90, "color": "red"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Time Series Graph
|
||||
```json
|
||||
{
|
||||
"type": "graph",
|
||||
"title": "CPU Usage",
|
||||
"targets": [{
|
||||
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
|
||||
}],
|
||||
"yaxes": [
|
||||
{"format": "percent", "max": 100, "min": 0},
|
||||
{"format": "short"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Table Panel
|
||||
```json
|
||||
{
|
||||
"type": "table",
|
||||
"title": "Service Status",
|
||||
"targets": [{
|
||||
"expr": "up",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"instance": "Instance",
|
||||
"job": "Service",
|
||||
"Value": "Status"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Heatmap
|
||||
```json
|
||||
{
|
||||
"type": "heatmap",
|
||||
"title": "Latency Heatmap",
|
||||
"targets": [{
|
||||
"expr": "sum(rate(http_request_duration_seconds_bucket[5m])) by (le)",
|
||||
"format": "heatmap"
|
||||
}],
|
||||
"dataFormat": "tsbuckets",
|
||||
"yAxis": {
|
||||
"format": "s"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Variables
|
||||
|
||||
### Query Variables
|
||||
```json
|
||||
{
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": "Prometheus",
|
||||
"query": "label_values(kube_pod_info, namespace)",
|
||||
"refresh": 1,
|
||||
"multi": false
|
||||
},
|
||||
{
|
||||
"name": "service",
|
||||
"type": "query",
|
||||
"datasource": "Prometheus",
|
||||
"query": "label_values(kube_service_info{namespace=\"$namespace\"}, service)",
|
||||
"refresh": 1,
|
||||
"multi": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Use Variables in Queries
|
||||
```
|
||||
sum(rate(http_requests_total{namespace="$namespace", service=~"$service"}[5m]))
|
||||
```
|
||||
|
||||
## Alerts in Dashboards
|
||||
|
||||
```json
|
||||
{
|
||||
"alert": {
|
||||
"name": "High Error Rate",
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [5],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {"type": "and"},
|
||||
"query": {
|
||||
"params": ["A", "5m", "now"]
|
||||
},
|
||||
"reducer": {"type": "avg"},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"executionErrorState": "alerting",
|
||||
"for": "5m",
|
||||
"frequency": "1m",
|
||||
"message": "Error rate is above 5%",
|
||||
"noDataState": "no_data",
|
||||
"notifications": [
|
||||
{"uid": "slack-channel"}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Dashboard Provisioning
|
||||
|
||||
**dashboards.yml:**
|
||||
```yaml
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'General'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/dashboards
|
||||
```
|
||||
|
||||
## Common Dashboard Patterns
|
||||
|
||||
### Infrastructure Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
- CPU utilization per node
|
||||
- Memory usage per node
|
||||
- Disk I/O
|
||||
- Network traffic
|
||||
- Pod count by namespace
|
||||
- Node status
|
||||
|
||||
**Reference:** See `assets/infrastructure-dashboard.json`
|
||||
|
||||
### Database Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
- Queries per second
|
||||
- Connection pool usage
|
||||
- Query latency (P50, P95, P99)
|
||||
- Active connections
|
||||
- Database size
|
||||
- Replication lag
|
||||
- Slow queries
|
||||
|
||||
**Reference:** See `assets/database-dashboard.json`
|
||||
|
||||
### Application Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
- Request rate
|
||||
- Error rate
|
||||
- Response time (percentiles)
|
||||
- Active users/sessions
|
||||
- Cache hit rate
|
||||
- Queue length
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Start with templates** (Grafana community dashboards)
|
||||
2. **Use consistent naming** for panels and variables
|
||||
3. **Group related metrics** in rows
|
||||
4. **Set appropriate time ranges** (default: Last 6 hours)
|
||||
5. **Use variables** for flexibility
|
||||
6. **Add panel descriptions** for context
|
||||
7. **Configure units** correctly
|
||||
8. **Set meaningful thresholds** for colors
|
||||
9. **Use consistent colors** across dashboards
|
||||
10. **Test with different time ranges**
|
||||
|
||||
## Dashboard as Code
|
||||
|
||||
### Terraform Provisioning
|
||||
|
||||
```hcl
|
||||
resource "grafana_dashboard" "api_monitoring" {
|
||||
config_json = file("${path.module}/dashboards/api-monitoring.json")
|
||||
folder = grafana_folder.monitoring.id
|
||||
}
|
||||
|
||||
resource "grafana_folder" "monitoring" {
|
||||
title = "Production Monitoring"
|
||||
}
|
||||
```
|
||||
|
||||
### Ansible Provisioning
|
||||
|
||||
```yaml
|
||||
- name: Deploy Grafana dashboards
|
||||
copy:
|
||||
src: "{{ item }}"
|
||||
dest: /etc/grafana/dashboards/
|
||||
with_fileglob:
|
||||
- "dashboards/*.json"
|
||||
notify: restart grafana
|
||||
```
|
||||
|
||||
## Reference Files
|
||||
|
||||
- `assets/api-dashboard.json` - API monitoring dashboard
|
||||
- `assets/infrastructure-dashboard.json` - Infrastructure dashboard
|
||||
- `assets/database-dashboard.json` - Database monitoring dashboard
|
||||
- `references/dashboard-design.md` - Dashboard design guide
|
||||
|
||||
## Related Skills
|
||||
|
||||
- `prometheus-configuration` - For metric collection
|
||||
- `slo-implementation` - For SLO dashboards
|
||||
308
skills/hetzner-provisioner/README.md
Normal file
308
skills/hetzner-provisioner/README.md
Normal file
@@ -0,0 +1,308 @@
|
||||
**Name:** hetzner-provisioner
|
||||
**Type:** Infrastructure / DevOps
|
||||
**Model:** Claude Sonnet 4.5 (balanced for IaC generation)
|
||||
**Status:** Planned
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Automated Hetzner Cloud infrastructure provisioning using Terraform or Pulumi. Generates production-ready IaC code for deploying SaaS applications at $10-15/month instead of $50-100/month on Vercel/AWS.
|
||||
|
||||
## When This Skill Activates
|
||||
|
||||
**Keywords**: deploy on Hetzner, Hetzner Cloud, budget deployment, cheap hosting, $10/month, cost-effective infrastructure
|
||||
|
||||
**Example prompts**:
|
||||
- "Deploy my NextJS app on Hetzner"
|
||||
- "I want the cheapest possible hosting for my SaaS"
|
||||
- "Set up infrastructure on Hetzner Cloud with Postgres"
|
||||
- "Deploy for under $15/month"
|
||||
|
||||
## What It Generates
|
||||
|
||||
### 1. Terraform Configuration
|
||||
|
||||
**main.tf**:
|
||||
```hcl
|
||||
terraform {
|
||||
required_providers {
|
||||
hcloud = {
|
||||
source = "hetznercloud/hcloud"
|
||||
version = "~> 1.45"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
provider "hcloud" {
|
||||
token = var.hcloud_token
|
||||
}
|
||||
|
||||
# Server instance
|
||||
resource "hcloud_server" "app" {
|
||||
name = "my-saas-app"
|
||||
server_type = "cx11"
|
||||
image = "ubuntu-22.04"
|
||||
location = "nbg1" # Nuremberg, Germany
|
||||
|
||||
user_data = file("${path.module}/cloud-init.yaml")
|
||||
|
||||
public_net {
|
||||
ipv4_enabled = true
|
||||
ipv6_enabled = true
|
||||
}
|
||||
}
|
||||
|
||||
# Managed Postgres database
|
||||
resource "hcloud_database" "postgres" {
|
||||
name = "my-saas-db"
|
||||
engine = "postgresql"
|
||||
version = "15"
|
||||
size = "db-1x-small"
|
||||
location = "nbg1"
|
||||
}
|
||||
|
||||
# Firewall
|
||||
resource "hcloud_firewall" "app" {
|
||||
name = "my-saas-firewall"
|
||||
|
||||
rule {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "80"
|
||||
source_ips = ["0.0.0.0/0", "::/0"]
|
||||
}
|
||||
|
||||
rule {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "443"
|
||||
source_ips = ["0.0.0.0/0", "::/0"]
|
||||
}
|
||||
|
||||
rule {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "22"
|
||||
source_ips = ["0.0.0.0/0", "::/0"] # Restrict to your IP in production
|
||||
}
|
||||
}
|
||||
|
||||
# Apply firewall to server
|
||||
resource "hcloud_firewall_attachment" "app" {
|
||||
firewall_id = hcloud_firewall.app.id
|
||||
server_ids = [hcloud_server.app.id]
|
||||
}
|
||||
|
||||
# Output deployment info
|
||||
output "server_ip" {
|
||||
value = hcloud_server.app.ipv4_address
|
||||
}
|
||||
|
||||
output "database_host" {
|
||||
value = hcloud_database.postgres.host
|
||||
}
|
||||
|
||||
output "database_port" {
|
||||
value = hcloud_database.postgres.port
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Docker Configuration
|
||||
|
||||
**Dockerfile**:
|
||||
```dockerfile
|
||||
FROM node:20-alpine AS base
|
||||
|
||||
# Dependencies
|
||||
FROM base AS deps
|
||||
WORKDIR /app
|
||||
COPY package*.json ./
|
||||
RUN npm ci
|
||||
|
||||
# Builder
|
||||
FROM base AS builder
|
||||
WORKDIR /app
|
||||
COPY --from=deps /app/node_modules ./node_modules
|
||||
COPY . .
|
||||
RUN npm run build
|
||||
|
||||
# Runner
|
||||
FROM base AS runner
|
||||
WORKDIR /app
|
||||
ENV NODE_ENV production
|
||||
|
||||
RUN addgroup --system --gid 1001 nodejs
|
||||
RUN adduser --system --uid 1001 nextjs
|
||||
|
||||
COPY --from=builder /app/public ./public
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
|
||||
|
||||
USER nextjs
|
||||
|
||||
EXPOSE 3000
|
||||
ENV PORT 3000
|
||||
|
||||
CMD ["node", "server.js"]
|
||||
```
|
||||
|
||||
### 3. GitHub Actions CI/CD
|
||||
|
||||
**.github/workflows/deploy.yml**:
|
||||
```yaml
|
||||
name: Deploy to Hetzner
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Terraform
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
|
||||
- name: Terraform Init
|
||||
run: terraform init
|
||||
working-directory: ./terraform
|
||||
env:
|
||||
HCLOUD_TOKEN: ${{ secrets.HETZNER_API_TOKEN }}
|
||||
|
||||
- name: Terraform Plan
|
||||
run: terraform plan
|
||||
working-directory: ./terraform
|
||||
env:
|
||||
HCLOUD_TOKEN: ${{ secrets.HETZNER_API_TOKEN }}
|
||||
|
||||
- name: Terraform Apply
|
||||
run: terraform apply -auto-approve
|
||||
working-directory: ./terraform
|
||||
env:
|
||||
HCLOUD_TOKEN: ${{ secrets.HETZNER_API_TOKEN }}
|
||||
|
||||
- name: Build and Deploy Docker
|
||||
run: |
|
||||
ssh ${{ secrets.SERVER_USER }}@${{ secrets.SERVER_IP }} << 'EOF'
|
||||
cd /app
|
||||
git pull
|
||||
docker-compose build
|
||||
docker-compose up -d
|
||||
EOF
|
||||
```
|
||||
|
||||
### 4. SSL Configuration (Let's Encrypt)
|
||||
|
||||
**nginx.conf** (auto-generated):
|
||||
```nginx
|
||||
server {
|
||||
listen 80;
|
||||
server_name your-domain.com;
|
||||
return 301 https://$server_name$request_uri;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 443 ssl http2;
|
||||
server_name your-domain.com;
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/your-domain.com/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/your-domain.com/privkey.pem;
|
||||
|
||||
location / {
|
||||
proxy_pass http://localhost:3000;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection 'upgrade';
|
||||
proxy_set_header Host $host;
|
||||
proxy_cache_bypass $http_upgrade;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Cost Breakdown
|
||||
|
||||
### Small SaaS (100-1000 users)
|
||||
- **CX11** (1 vCPU, 2GB RAM): $5.83/month
|
||||
- **Managed Postgres** (2GB): $5.00/month
|
||||
- **Storage** (20GB): $0.50/month
|
||||
- **SSL** (Let's Encrypt): Free
|
||||
- **Total**: ~$11.33/month
|
||||
|
||||
### Medium SaaS (1000-10000 users)
|
||||
- **CX21** (2 vCPU, 4GB RAM): $6.90/month
|
||||
- **Managed Postgres** (4GB): $10.00/month
|
||||
- **Storage** (40GB): $1.00/month
|
||||
- **Total**: ~$18/month
|
||||
|
||||
### Large SaaS (10000+ users)
|
||||
- **CX31** (2 vCPU, 8GB RAM): $14.28/month
|
||||
- **Managed Postgres** (8GB): $20.00/month
|
||||
- **Storage** (80GB): $2.00/month
|
||||
- **Total**: ~$36/month
|
||||
|
||||
## Test Cases
|
||||
|
||||
### Test 1: Basic Provision
|
||||
**File**: `test-cases/test-1-basic-provision.yaml`
|
||||
**Scenario**: Provision CX11 instance with Docker
|
||||
**Expected**: Terraform code generated, cost ~$6/month
|
||||
|
||||
### Test 2: Postgres Provision
|
||||
**File**: `test-cases/test-2-postgres-provision.yaml`
|
||||
**Scenario**: Add managed Postgres database
|
||||
**Expected**: Database resource added, cost ~$11/month
|
||||
|
||||
### Test 3: SSL Configuration
|
||||
**File**: `test-cases/test-3-ssl-config.yaml`
|
||||
**Scenario**: Configure SSL with Let's Encrypt
|
||||
**Expected**: Nginx + Certbot configuration, HTTPS working
|
||||
|
||||
## Verification Steps
|
||||
|
||||
See `test-results/README.md` for:
|
||||
1. How to run each test case
|
||||
2. Expected vs actual output
|
||||
3. Manual verification steps
|
||||
4. Screenshots of successful deployment
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
- **cost-optimizer**: Recommends Hetzner when budget <$20/month
|
||||
- **devops-agent**: Provides strategic infrastructure planning
|
||||
- **nextjs-agent**: NextJS-specific deployment configuration
|
||||
- **nodejs-backend**: Node.js app deployment
|
||||
- **monitoring-setup**: Adds Uptime Kuma monitoring
|
||||
|
||||
## Limitations
|
||||
|
||||
- **EU-only**: Data centers in Germany/Finland (GDPR-friendly but not global)
|
||||
- **No auto-scaling**: Manual scaling only (upgrade instance type)
|
||||
- **Single-region**: Multi-region requires manual setup
|
||||
- **No serverless**: Traditional VM-based hosting
|
||||
|
||||
## Alternatives
|
||||
|
||||
When NOT to use Hetzner:
|
||||
- **Global audience**: Use Vercel (global edge network)
|
||||
- **Auto-scaling needed**: Use AWS/GCP
|
||||
- **Serverless preferred**: Use Vercel/Netlify
|
||||
- **Enterprise SLA required**: Use AWS/Azure with support plans
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- [ ] Kubernetes (k3s) cluster setup
|
||||
- [ ] Load balancer configuration
|
||||
- [ ] Multi-region deployment
|
||||
- [ ] Auto-scaling with Hetzner Cloud API
|
||||
- [ ] Monitoring integration (Grafana + Prometheus)
|
||||
- [ ] Disaster recovery automation
|
||||
|
||||
---
|
||||
|
||||
**Status**: Planned (Increment 003)
|
||||
**Priority**: P1
|
||||
**Tests**: 3+ test cases required
|
||||
**Documentation**: `.specweave/docs/guides/hetzner-deployment.md`
|
||||
251
skills/hetzner-provisioner/SKILL.md
Normal file
251
skills/hetzner-provisioner/SKILL.md
Normal file
@@ -0,0 +1,251 @@
|
||||
---
|
||||
name: hetzner-provisioner
|
||||
description: Provisions infrastructure on Hetzner Cloud with Terraform/Pulumi. Generates IaC code for CX11/CX21/CX31 instances, managed Postgres, SSL configuration, Docker deployment. Activates for deploy on Hetzner, Hetzner Cloud, budget deployment, cheap hosting, $10/month hosting.
|
||||
---
|
||||
|
||||
# Hetzner Cloud Provisioner
|
||||
|
||||
Automated infrastructure provisioning for Hetzner Cloud - the budget-friendly alternative to Vercel and AWS.
|
||||
|
||||
## Purpose
|
||||
|
||||
Generate and deploy infrastructure-as-code (Terraform/Pulumi) for Hetzner Cloud, enabling $10-15/month SaaS deployments instead of $50-100/month on other platforms.
|
||||
|
||||
## When to Use
|
||||
|
||||
Activates when user mentions:
|
||||
- "deploy on Hetzner"
|
||||
- "Hetzner Cloud"
|
||||
- "budget deployment"
|
||||
- "cheap hosting"
|
||||
- "deploy for $10/month"
|
||||
- "cost-effective infrastructure"
|
||||
|
||||
## What It Does
|
||||
|
||||
1. **Analyzes requirements**:
|
||||
- Application type (NextJS, Node.js, Python, etc.)
|
||||
- Database needs (Postgres, MySQL, Redis)
|
||||
- Expected traffic/users
|
||||
- Budget constraints
|
||||
|
||||
2. **Generates Infrastructure-as-Code**:
|
||||
- Terraform configuration for Hetzner Cloud
|
||||
- Alternative: Pulumi for TypeScript-native IaC
|
||||
- Server instances (CX11, CX21, CX31)
|
||||
- Managed databases (Postgres, MySQL)
|
||||
- Object storage (if needed)
|
||||
- Networking (firewall rules, floating IPs)
|
||||
|
||||
3. **Configures Production Setup**:
|
||||
- Docker containerization
|
||||
- SSL certificates (Let's Encrypt)
|
||||
- DNS configuration (Cloudflare or Hetzner DNS)
|
||||
- GitHub Actions CI/CD pipeline
|
||||
- Monitoring (Uptime Kuma, self-hosted)
|
||||
- Automated backups
|
||||
|
||||
4. **Outputs Deployment Guide**:
|
||||
- Step-by-step deployment instructions
|
||||
- Cost breakdown
|
||||
- Monitoring URLs
|
||||
- Troubleshooting guide
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ CRITICAL: Secrets Required (MANDATORY CHECK)
|
||||
|
||||
**BEFORE generating Terraform/Pulumi code, CHECK for Hetzner API token.**
|
||||
|
||||
### Step 1: Check If Token Exists
|
||||
|
||||
```bash
|
||||
# Check .env file
|
||||
if [ -f .env ] && grep -q "HETZNER_API_TOKEN" .env; then
|
||||
echo "✅ Hetzner API token found"
|
||||
else
|
||||
# Token NOT found - STOP and prompt user
|
||||
fi
|
||||
```
|
||||
|
||||
### Step 2: If Token Missing, STOP and Show This Message
|
||||
|
||||
```
|
||||
🔐 **Hetzner API Token Required**
|
||||
|
||||
I need your Hetzner API token to provision infrastructure.
|
||||
|
||||
**How to get it**:
|
||||
1. Go to: https://console.hetzner.cloud/
|
||||
2. Click on your project (or create one)
|
||||
3. Navigate to: Security → API Tokens
|
||||
4. Click "Generate API Token"
|
||||
5. Give it a name (e.g., "specweave-deployment")
|
||||
6. Permissions: **Read & Write**
|
||||
7. Click "Generate"
|
||||
8. **Copy the token immediately** (you can't see it again!)
|
||||
|
||||
**Where I'll save it**:
|
||||
- File: `.env` (gitignored, secure)
|
||||
- Format: `HETZNER_API_TOKEN=your-token-here`
|
||||
|
||||
**Security**:
|
||||
✅ .env is in .gitignore (never committed to git)
|
||||
✅ Token is 64 characters, alphanumeric
|
||||
✅ Stored locally only (not in source code)
|
||||
|
||||
Please paste your Hetzner API token:
|
||||
```
|
||||
|
||||
### Step 3: Validate Token Format
|
||||
|
||||
```bash
|
||||
# Hetzner tokens are 64 alphanumeric characters
|
||||
if [[ ! "$HETZNER_API_TOKEN" =~ ^[a-zA-Z0-9]{64}$ ]]; then
|
||||
echo "⚠️ Warning: Token format unexpected"
|
||||
echo "Expected: 64 alphanumeric characters"
|
||||
echo "Got: ${#HETZNER_API_TOKEN} characters"
|
||||
echo ""
|
||||
echo "This might not be a valid Hetzner API token."
|
||||
echo "Continue anyway? (yes/no)"
|
||||
fi
|
||||
```
|
||||
|
||||
### Step 4: Save Token Securely
|
||||
|
||||
```bash
|
||||
# Save to .env
|
||||
echo "HETZNER_API_TOKEN=$HETZNER_API_TOKEN" >> .env
|
||||
|
||||
# Ensure .env is gitignored
|
||||
if ! grep -q "^\.env$" .gitignore; then
|
||||
echo ".env" >> .gitignore
|
||||
fi
|
||||
|
||||
# Create .env.example for team
|
||||
cat > .env.example << 'EOF'
|
||||
# Hetzner Cloud API Token
|
||||
# Get from: https://console.hetzner.cloud/ → Security → API Tokens
|
||||
HETZNER_API_TOKEN=your-hetzner-token-here
|
||||
EOF
|
||||
|
||||
echo "✅ Token saved to .env (gitignored)"
|
||||
echo "✅ Created .env.example for team (commit this)"
|
||||
```
|
||||
|
||||
### Step 5: Use Token in Terraform
|
||||
|
||||
```hcl
|
||||
# terraform/variables.tf
|
||||
variable "hetzner_token" {
|
||||
description = "Hetzner Cloud API Token"
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
# terraform/provider.tf
|
||||
provider "hcloud" {
|
||||
token = var.hetzner_token
|
||||
}
|
||||
|
||||
# Run Terraform
|
||||
# export TF_VAR_hetzner_token=$HETZNER_API_TOKEN
|
||||
# terraform apply
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Example Usage
|
||||
|
||||
**User**: "Deploy my NextJS app on Hetzner with Postgres, as cheaply as possible"
|
||||
|
||||
**hetzner-provisioner**:
|
||||
```markdown
|
||||
I'll provision your NextJS app on Hetzner Cloud for ~$12/month.
|
||||
|
||||
Infrastructure:
|
||||
- CX11 instance (1 vCPU, 2GB RAM): $5.83/month
|
||||
- Managed Postgres (2GB): $5.00/month
|
||||
- 20GB storage: $0.50/month
|
||||
- SSL (Let's Encrypt): Free
|
||||
- Total: ~$11.33/month
|
||||
|
||||
Creating Terraform configuration...
|
||||
|
||||
✅ Generated files:
|
||||
- terraform/main.tf
|
||||
- terraform/variables.tf
|
||||
- terraform/outputs.tf
|
||||
- .github/workflows/deploy.yml
|
||||
|
||||
Next steps:
|
||||
1. Set HETZNER_API_TOKEN in GitHub secrets
|
||||
2. Push to GitHub
|
||||
3. GitHub Actions will deploy automatically
|
||||
|
||||
Deployment URL: https://your-app.yourdomain.com (after DNS configured)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Supports multiple instance types:
|
||||
- **CX11** (1 vCPU, 2GB RAM): $5.83/month - Small apps, 100-1000 users
|
||||
- **CX21** (2 vCPU, 4GB RAM): $6.90/month - Medium apps, 1000-10000 users
|
||||
- **CX31** (2 vCPU, 8GB RAM): $14.28/month - Larger apps, 10000+ users
|
||||
|
||||
Database options:
|
||||
- Managed Postgres (2GB): $5/month
|
||||
- Managed MySQL (2GB): $5/month
|
||||
- Self-hosted (included in instance cost)
|
||||
|
||||
## Test Cases
|
||||
|
||||
See `test-cases/` for validation scenarios:
|
||||
1. **test-1-basic-provision.yaml** - Basic CX11 instance
|
||||
2. **test-2-postgres-provision.yaml** - Add managed Postgres
|
||||
3. **test-3-ssl-config.yaml** - SSL and DNS configuration
|
||||
|
||||
## Cost Comparison
|
||||
|
||||
| Platform | Small App | Medium App | Large App |
|
||||
|----------|-----------|------------|-----------|
|
||||
| **Hetzner** | $12/mo | $15/mo | $25/mo |
|
||||
| Vercel | $60/mo | $120/mo | $240/mo |
|
||||
| AWS | $25/mo | $80/mo | $200/mo |
|
||||
| Railway | $20/mo | $50/mo | $100/mo |
|
||||
|
||||
**Savings**: 50-80% vs alternatives
|
||||
|
||||
## Technical Details
|
||||
|
||||
**Terraform Provider**: `hetznercloud/hcloud`
|
||||
**API**: Hetzner Cloud API v1
|
||||
**Regions**: Nuremberg, Falkenstein, Helsinki (Germany/Finland)
|
||||
**Deployment**: Docker + GitHub Actions
|
||||
**Monitoring**: Uptime Kuma (self-hosted, free)
|
||||
|
||||
## Integration
|
||||
|
||||
Works with:
|
||||
- `cost-optimizer` - Recommends Hetzner when budget-conscious
|
||||
- `devops-agent` - Strategic infrastructure planning
|
||||
- `nextjs-agent` - NextJS-specific deployment
|
||||
- Any backend framework (Node.js, Python, Go, etc.)
|
||||
|
||||
## Limitations
|
||||
|
||||
- EU-only data centers (GDPR-friendly)
|
||||
- Requires Hetzner Cloud account
|
||||
- Manual DNS configuration needed
|
||||
- Not suitable for multi-region deployments (use AWS/GCP for that)
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- Kubernetes support (k3s on Hetzner)
|
||||
- Load balancer configuration
|
||||
- Multi-region deployment
|
||||
- Disaster recovery setup
|
||||
|
||||
---
|
||||
|
||||
**For detailed usage**, see `README.md` and test cases in `test-cases/`
|
||||
392
skills/prometheus-configuration/SKILL.md
Normal file
392
skills/prometheus-configuration/SKILL.md
Normal file
@@ -0,0 +1,392 @@
|
||||
---
|
||||
name: prometheus-configuration
|
||||
description: Set up Prometheus for comprehensive metric collection, storage, and monitoring of infrastructure and applications. Use when implementing metrics collection, setting up monitoring infrastructure, or configuring alerting systems.
|
||||
---
|
||||
|
||||
# Prometheus Configuration
|
||||
|
||||
Complete guide to Prometheus setup, metric collection, scrape configuration, and recording rules.
|
||||
|
||||
## Purpose
|
||||
|
||||
Configure Prometheus for comprehensive metric collection, alerting, and monitoring of infrastructure and applications.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Set up Prometheus monitoring
|
||||
- Configure metric scraping
|
||||
- Create recording rules
|
||||
- Design alert rules
|
||||
- Implement service discovery
|
||||
|
||||
## Prometheus Architecture
|
||||
|
||||
```
|
||||
┌──────────────┐
|
||||
│ Applications │ ← Instrumented with client libraries
|
||||
└──────┬───────┘
|
||||
│ /metrics endpoint
|
||||
↓
|
||||
┌──────────────┐
|
||||
│ Prometheus │ ← Scrapes metrics periodically
|
||||
│ Server │
|
||||
└──────┬───────┘
|
||||
│
|
||||
├─→ AlertManager (alerts)
|
||||
├─→ Grafana (visualization)
|
||||
└─→ Long-term storage (Thanos/Cortex)
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
### Kubernetes with Helm
|
||||
|
||||
```bash
|
||||
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||||
helm repo update
|
||||
|
||||
helm install prometheus prometheus-community/kube-prometheus-stack \
|
||||
--namespace monitoring \
|
||||
--create-namespace \
|
||||
--set prometheus.prometheusSpec.retention=30d \
|
||||
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi
|
||||
```
|
||||
|
||||
### Docker Compose
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
```
|
||||
|
||||
## Configuration File
|
||||
|
||||
**prometheus.yml:**
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'production'
|
||||
region: 'us-west-2'
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
# Load rules files
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
# Scrape configurations
|
||||
scrape_configs:
|
||||
# Prometheus itself
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Node exporters
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'node1:9100'
|
||||
- 'node2:9100'
|
||||
- 'node3:9100'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
regex: '([^:]+)(:[0-9]+)?'
|
||||
replacement: '${1}'
|
||||
|
||||
# Kubernetes pods with annotations
|
||||
- job_name: 'kubernetes-pods'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
||||
action: keep
|
||||
regex: true
|
||||
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
||||
action: replace
|
||||
target_label: __metrics_path__
|
||||
regex: (.+)
|
||||
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||
action: replace
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
target_label: __address__
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
action: replace
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
action: replace
|
||||
target_label: pod
|
||||
|
||||
# Application metrics
|
||||
- job_name: 'my-app'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'app1.example.com:9090'
|
||||
- 'app2.example.com:9090'
|
||||
metrics_path: '/metrics'
|
||||
scheme: 'https'
|
||||
tls_config:
|
||||
ca_file: /etc/prometheus/ca.crt
|
||||
cert_file: /etc/prometheus/client.crt
|
||||
key_file: /etc/prometheus/client.key
|
||||
```
|
||||
|
||||
**Reference:** See `assets/prometheus.yml.template`
|
||||
|
||||
## Scrape Configurations
|
||||
|
||||
### Static Targets
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'static-targets'
|
||||
static_configs:
|
||||
- targets: ['host1:9100', 'host2:9100']
|
||||
labels:
|
||||
env: 'production'
|
||||
region: 'us-west-2'
|
||||
```
|
||||
|
||||
### File-based Service Discovery
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'file-sd'
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- /etc/prometheus/targets/*.json
|
||||
- /etc/prometheus/targets/*.yml
|
||||
refresh_interval: 5m
|
||||
```
|
||||
|
||||
**targets/production.json:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"targets": ["app1:9090", "app2:9090"],
|
||||
"labels": {
|
||||
"env": "production",
|
||||
"service": "api"
|
||||
}
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Kubernetes Service Discovery
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'kubernetes-services'
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
||||
action: keep
|
||||
regex: true
|
||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
||||
action: replace
|
||||
target_label: __scheme__
|
||||
regex: (https?)
|
||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
||||
action: replace
|
||||
target_label: __metrics_path__
|
||||
regex: (.+)
|
||||
```
|
||||
|
||||
**Reference:** See `references/scrape-configs.md`
|
||||
|
||||
## Recording Rules
|
||||
|
||||
Create pre-computed metrics for frequently queried expressions:
|
||||
|
||||
```yaml
|
||||
# /etc/prometheus/rules/recording_rules.yml
|
||||
groups:
|
||||
- name: api_metrics
|
||||
interval: 15s
|
||||
rules:
|
||||
# HTTP request rate per service
|
||||
- record: job:http_requests:rate5m
|
||||
expr: sum by (job) (rate(http_requests_total[5m]))
|
||||
|
||||
# Error rate percentage
|
||||
- record: job:http_requests_errors:rate5m
|
||||
expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
|
||||
|
||||
- record: job:http_requests_error_rate:percentage
|
||||
expr: |
|
||||
(job:http_requests_errors:rate5m / job:http_requests:rate5m) * 100
|
||||
|
||||
# P95 latency
|
||||
- record: job:http_request_duration:p95
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))
|
||||
)
|
||||
|
||||
- name: resource_metrics
|
||||
interval: 30s
|
||||
rules:
|
||||
# CPU utilization percentage
|
||||
- record: instance:node_cpu:utilization
|
||||
expr: |
|
||||
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
|
||||
# Memory utilization percentage
|
||||
- record: instance:node_memory:utilization
|
||||
expr: |
|
||||
100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)
|
||||
|
||||
# Disk usage percentage
|
||||
- record: instance:node_disk:utilization
|
||||
expr: |
|
||||
100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
|
||||
```
|
||||
|
||||
**Reference:** See `references/recording-rules.md`
|
||||
|
||||
## Alert Rules
|
||||
|
||||
```yaml
|
||||
# /etc/prometheus/rules/alert_rules.yml
|
||||
groups:
|
||||
- name: availability
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: ServiceDown
|
||||
expr: up{job="my-app"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service {{ $labels.instance }} is down"
|
||||
description: "{{ $labels.job }} has been down for more than 1 minute"
|
||||
|
||||
- alert: HighErrorRate
|
||||
expr: job:http_requests_error_rate:percentage > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate for {{ $labels.job }}"
|
||||
description: "Error rate is {{ $value }}% (threshold: 5%)"
|
||||
|
||||
- alert: HighLatency
|
||||
expr: job:http_request_duration:p95 > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High latency for {{ $labels.job }}"
|
||||
description: "P95 latency is {{ $value }}s (threshold: 1s)"
|
||||
|
||||
- name: resources
|
||||
interval: 1m
|
||||
rules:
|
||||
- alert: HighCPUUsage
|
||||
expr: instance:node_cpu:utilization > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage on {{ $labels.instance }}"
|
||||
description: "CPU usage is {{ $value }}%"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: instance:node_memory:utilization > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
|
||||
- alert: DiskSpaceLow
|
||||
expr: instance:node_disk:utilization > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Low disk space on {{ $labels.instance }}"
|
||||
description: "Disk usage is {{ $value }}%"
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
```bash
|
||||
# Validate configuration
|
||||
promtool check config prometheus.yml
|
||||
|
||||
# Validate rules
|
||||
promtool check rules /etc/prometheus/rules/*.yml
|
||||
|
||||
# Test query
|
||||
promtool query instant http://localhost:9090 'up'
|
||||
```
|
||||
|
||||
**Reference:** See `scripts/validate-prometheus.sh`
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use consistent naming** for metrics (prefix_name_unit)
|
||||
2. **Set appropriate scrape intervals** (15-60s typical)
|
||||
3. **Use recording rules** for expensive queries
|
||||
4. **Implement high availability** (multiple Prometheus instances)
|
||||
5. **Configure retention** based on storage capacity
|
||||
6. **Use relabeling** for metric cleanup
|
||||
7. **Monitor Prometheus itself**
|
||||
8. **Implement federation** for large deployments
|
||||
9. **Use Thanos/Cortex** for long-term storage
|
||||
10. **Document custom metrics**
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Check scrape targets:**
|
||||
```bash
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
```
|
||||
|
||||
**Check configuration:**
|
||||
```bash
|
||||
curl http://localhost:9090/api/v1/status/config
|
||||
```
|
||||
|
||||
**Test query:**
|
||||
```bash
|
||||
curl 'http://localhost:9090/api/v1/query?query=up'
|
||||
```
|
||||
|
||||
## Reference Files
|
||||
|
||||
- `assets/prometheus.yml.template` - Complete configuration template
|
||||
- `references/scrape-configs.md` - Scrape configuration patterns
|
||||
- `references/recording-rules.md` - Recording rule examples
|
||||
- `scripts/validate-prometheus.sh` - Validation script
|
||||
|
||||
## Related Skills
|
||||
|
||||
- `grafana-dashboards` - For visualization
|
||||
- `slo-implementation` - For SLO monitoring
|
||||
- `distributed-tracing` - For request tracing
|
||||
329
skills/slo-implementation/SKILL.md
Normal file
329
skills/slo-implementation/SKILL.md
Normal file
@@ -0,0 +1,329 @@
|
||||
---
|
||||
name: slo-implementation
|
||||
description: Define and implement Service Level Indicators (SLIs) and Service Level Objectives (SLOs) with error budgets and alerting. Use when establishing reliability targets, implementing SRE practices, or measuring service performance.
|
||||
---
|
||||
|
||||
# SLO Implementation
|
||||
|
||||
Framework for defining and implementing Service Level Indicators (SLIs), Service Level Objectives (SLOs), and error budgets.
|
||||
|
||||
## Purpose
|
||||
|
||||
Implement measurable reliability targets using SLIs, SLOs, and error budgets to balance reliability with innovation velocity.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Define service reliability targets
|
||||
- Measure user-perceived reliability
|
||||
- Implement error budgets
|
||||
- Create SLO-based alerts
|
||||
- Track reliability goals
|
||||
|
||||
## SLI/SLO/SLA Hierarchy
|
||||
|
||||
```
|
||||
SLA (Service Level Agreement)
|
||||
↓ Contract with customers
|
||||
SLO (Service Level Objective)
|
||||
↓ Internal reliability target
|
||||
SLI (Service Level Indicator)
|
||||
↓ Actual measurement
|
||||
```
|
||||
|
||||
## Defining SLIs
|
||||
|
||||
### Common SLI Types
|
||||
|
||||
#### 1. Availability SLI
|
||||
```promql
|
||||
# Successful requests / Total requests
|
||||
sum(rate(http_requests_total{status!~"5.."}[28d]))
|
||||
/
|
||||
sum(rate(http_requests_total[28d]))
|
||||
```
|
||||
|
||||
#### 2. Latency SLI
|
||||
```promql
|
||||
# Requests below latency threshold / Total requests
|
||||
sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
|
||||
/
|
||||
sum(rate(http_request_duration_seconds_count[28d]))
|
||||
```
|
||||
|
||||
#### 3. Durability SLI
|
||||
```
|
||||
# Successful writes / Total writes
|
||||
sum(storage_writes_successful_total)
|
||||
/
|
||||
sum(storage_writes_total)
|
||||
```
|
||||
|
||||
**Reference:** See `references/slo-definitions.md`
|
||||
|
||||
## Setting SLO Targets
|
||||
|
||||
### Availability SLO Examples
|
||||
|
||||
| SLO % | Downtime/Month | Downtime/Year |
|
||||
|-------|----------------|---------------|
|
||||
| 99% | 7.2 hours | 3.65 days |
|
||||
| 99.9% | 43.2 minutes | 8.76 hours |
|
||||
| 99.95%| 21.6 minutes | 4.38 hours |
|
||||
| 99.99%| 4.32 minutes | 52.56 minutes |
|
||||
|
||||
### Choose Appropriate SLOs
|
||||
|
||||
**Consider:**
|
||||
- User expectations
|
||||
- Business requirements
|
||||
- Current performance
|
||||
- Cost of reliability
|
||||
- Competitor benchmarks
|
||||
|
||||
**Example SLOs:**
|
||||
```yaml
|
||||
slos:
|
||||
- name: api_availability
|
||||
target: 99.9
|
||||
window: 28d
|
||||
sli: |
|
||||
sum(rate(http_requests_total{status!~"5.."}[28d]))
|
||||
/
|
||||
sum(rate(http_requests_total[28d]))
|
||||
|
||||
- name: api_latency_p95
|
||||
target: 99
|
||||
window: 28d
|
||||
sli: |
|
||||
sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
|
||||
/
|
||||
sum(rate(http_request_duration_seconds_count[28d]))
|
||||
```
|
||||
|
||||
## Error Budget Calculation
|
||||
|
||||
### Error Budget Formula
|
||||
|
||||
```
|
||||
Error Budget = 1 - SLO Target
|
||||
```
|
||||
|
||||
**Example:**
|
||||
- SLO: 99.9% availability
|
||||
- Error Budget: 0.1% = 43.2 minutes/month
|
||||
- Current Error: 0.05% = 21.6 minutes/month
|
||||
- Remaining Budget: 50%
|
||||
|
||||
### Error Budget Policy
|
||||
|
||||
```yaml
|
||||
error_budget_policy:
|
||||
- remaining_budget: 100%
|
||||
action: Normal development velocity
|
||||
- remaining_budget: 50%
|
||||
action: Consider postponing risky changes
|
||||
- remaining_budget: 10%
|
||||
action: Freeze non-critical changes
|
||||
- remaining_budget: 0%
|
||||
action: Feature freeze, focus on reliability
|
||||
```
|
||||
|
||||
**Reference:** See `references/error-budget.md`
|
||||
|
||||
## SLO Implementation
|
||||
|
||||
### Prometheus Recording Rules
|
||||
|
||||
```yaml
|
||||
# SLI Recording Rules
|
||||
groups:
|
||||
- name: sli_rules
|
||||
interval: 30s
|
||||
rules:
|
||||
# Availability SLI
|
||||
- record: sli:http_availability:ratio
|
||||
expr: |
|
||||
sum(rate(http_requests_total{status!~"5.."}[28d]))
|
||||
/
|
||||
sum(rate(http_requests_total[28d]))
|
||||
|
||||
# Latency SLI (requests < 500ms)
|
||||
- record: sli:http_latency:ratio
|
||||
expr: |
|
||||
sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
|
||||
/
|
||||
sum(rate(http_request_duration_seconds_count[28d]))
|
||||
|
||||
- name: slo_rules
|
||||
interval: 5m
|
||||
rules:
|
||||
# SLO compliance (1 = meeting SLO, 0 = violating)
|
||||
- record: slo:http_availability:compliance
|
||||
expr: sli:http_availability:ratio >= bool 0.999
|
||||
|
||||
- record: slo:http_latency:compliance
|
||||
expr: sli:http_latency:ratio >= bool 0.99
|
||||
|
||||
# Error budget remaining (percentage)
|
||||
- record: slo:http_availability:error_budget_remaining
|
||||
expr: |
|
||||
(sli:http_availability:ratio - 0.999) / (1 - 0.999) * 100
|
||||
|
||||
# Error budget burn rate
|
||||
- record: slo:http_availability:burn_rate_5m
|
||||
expr: |
|
||||
(1 - (
|
||||
sum(rate(http_requests_total{status!~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total[5m]))
|
||||
)) / (1 - 0.999)
|
||||
```
|
||||
|
||||
### SLO Alerting Rules
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: slo_alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
      # Fast burn: 14.4x rate, 1 hour window
      # (requires burn_rate_1h/burn_rate_30m/burn_rate_6h recording rules,
      #  defined analogously to burn_rate_5m in the recording rules above)
|
||||
# Consumes 2% error budget in 1 hour
|
||||
- alert: SLOErrorBudgetBurnFast
|
||||
expr: |
|
||||
slo:http_availability:burn_rate_1h > 14.4
|
||||
and
|
||||
slo:http_availability:burn_rate_5m > 14.4
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Fast error budget burn detected"
|
||||
description: "Error budget burning at {{ $value }}x rate"
|
||||
|
||||
# Slow burn: 6x rate, 6 hour window
|
||||
# Consumes 5% error budget in 6 hours
|
||||
- alert: SLOErrorBudgetBurnSlow
|
||||
expr: |
|
||||
slo:http_availability:burn_rate_6h > 6
|
||||
and
|
||||
slo:http_availability:burn_rate_30m > 6
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Slow error budget burn detected"
|
||||
description: "Error budget burning at {{ $value }}x rate"
|
||||
|
||||
# Error budget exhausted
|
||||
- alert: SLOErrorBudgetExhausted
|
||||
expr: slo:http_availability:error_budget_remaining < 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "SLO error budget exhausted"
|
||||
description: "Error budget remaining: {{ $value }}%"
|
||||
```
|
||||
|
||||
## SLO Dashboard
|
||||
|
||||
**Grafana Dashboard Structure:**
|
||||
|
||||
```
|
||||
┌────────────────────────────────────┐
|
||||
│ SLO Compliance (Current) │
|
||||
│ ✓ 99.95% (Target: 99.9%) │
|
||||
├────────────────────────────────────┤
|
||||
│ Error Budget Remaining: 65% │
|
||||
│ ████████░░ 65% │
|
||||
├────────────────────────────────────┤
|
||||
│ SLI Trend (28 days) │
|
||||
│ [Time series graph] │
|
||||
├────────────────────────────────────┤
|
||||
│ Burn Rate Analysis │
|
||||
│ [Burn rate by time window] │
|
||||
└────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Example Queries:**
|
||||
|
||||
```promql
|
||||
# Current SLO compliance
|
||||
sli:http_availability:ratio * 100
|
||||
|
||||
# Error budget remaining
|
||||
slo:http_availability:error_budget_remaining
|
||||
|
||||
# Days until error budget exhausted (at current burn rate)
|
||||
(slo:http_availability:error_budget_remaining / 100)
|
||||
*
|
||||
28
|
||||
/
|
||||
(1 - sli:http_availability:ratio) * (1 - 0.999)
|
||||
```
|
||||
|
||||
## Multi-Window Burn Rate Alerts
|
||||
|
||||
```yaml
|
||||
# Combination of short and long windows reduces false positives
|
||||
rules:
|
||||
- alert: SLOBurnRateHigh
|
||||
expr: |
|
||||
(
|
||||
slo:http_availability:burn_rate_1h > 14.4
|
||||
and
|
||||
slo:http_availability:burn_rate_5m > 14.4
|
||||
)
|
||||
or
|
||||
(
|
||||
slo:http_availability:burn_rate_6h > 6
|
||||
and
|
||||
slo:http_availability:burn_rate_30m > 6
|
||||
)
|
||||
labels:
|
||||
severity: critical
|
||||
```
|
||||
|
||||
## SLO Review Process
|
||||
|
||||
### Weekly Review
|
||||
- Current SLO compliance
|
||||
- Error budget status
|
||||
- Trend analysis
|
||||
- Incident impact
|
||||
|
||||
### Monthly Review
|
||||
- SLO achievement
|
||||
- Error budget usage
|
||||
- Incident postmortems
|
||||
- SLO adjustments
|
||||
|
||||
### Quarterly Review
|
||||
- SLO relevance
|
||||
- Target adjustments
|
||||
- Process improvements
|
||||
- Tooling enhancements
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Start with user-facing services**
|
||||
2. **Use multiple SLIs** (availability, latency, etc.)
|
||||
3. **Set achievable SLOs** (don't aim for 100%)
|
||||
4. **Implement multi-window alerts** to reduce noise
|
||||
5. **Track error budget** consistently
|
||||
6. **Review SLOs regularly**
|
||||
7. **Document SLO decisions**
|
||||
8. **Align with business goals**
|
||||
9. **Automate SLO reporting**
|
||||
10. **Use SLOs for prioritization**
|
||||
|
||||
## Reference Files
|
||||
|
||||
- `assets/slo-template.md` - SLO definition template
|
||||
- `references/slo-definitions.md` - SLO definition patterns
|
||||
- `references/error-budget.md` - Error budget calculations
|
||||
|
||||
## Related Skills
|
||||
|
||||
- `prometheus-configuration` - For metric collection
|
||||
- `grafana-dashboards` - For SLO visualization
|
||||
Reference in New Issue
Block a user