Initial commit
assets/templates/otel-config/collector-config.yaml (new file, 227 lines)
@@ -0,0 +1,227 @@
# OpenTelemetry Collector Configuration
# Receives metrics, logs, and traces and exports to various backends

receivers:
  # OTLP receiver (standard OpenTelemetry protocol)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Prometheus receiver (scrape Prometheus endpoints)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']

  # Host metrics (CPU, memory, disk, network)
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      network:
      filesystem:
      load:

  # Kubernetes receiver (cluster metrics)
  k8s_cluster:
    auth_type: serviceAccount
    node_conditions_to_report: [Ready, MemoryPressure, DiskPressure]
    distribution: kubernetes

  # Zipkin receiver (legacy tracing)
  zipkin:
    endpoint: 0.0.0.0:9411

processors:
  # Batch processor (improves performance)
  batch:
    timeout: 10s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Memory limiter (prevent OOM)
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Resource processor (add resource attributes)
  resource:
    attributes:
      - key: environment
        value: production
        action: insert
      - key: cluster.name
        value: prod-cluster
        action: insert

  # Attributes processor (modify span/metric attributes)
  attributes:
    actions:
      - key: http.url
        action: delete  # Remove potentially sensitive URLs
      - key: db.statement
        action: hash  # Hash SQL queries for privacy

  # Filter processor (drop unwanted data)
  filter:
    metrics:
      # Drop metrics matching criteria
      exclude:
        match_type: regexp
        metric_names:
          - ^go_.*       # Drop Go runtime metrics
          - ^process_.*  # Drop process metrics

  # Tail sampling (intelligent trace sampling)
  tail_sampling:
    decision_wait: 10s
    num_traces: 100
    policies:
      # Always sample errors
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]

      # Sample slow traces
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 1000

      # Sample 10% of others
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  # Span processor (modify spans)
  span:
    name:
      to_attributes:
        rules:
          - ^\/api\/v1\/users\/(?P<user_id>.*)$
      from_attributes:
        - db.name
        - http.method

exporters:
  # Prometheus exporter (expose metrics endpoint)
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel

  # OTLP exporters (send to backends)
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: true

  # Loki exporter (for logs)
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    labels:
      resource:
        service.name: "service_name"
        service.namespace: "service_namespace"
      attributes:
        level: "level"

  # Jaeger exporter (alternative tracing backend)
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  # Elasticsearch exporter (for logs)
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    logs_index: otel-logs
    traces_index: otel-traces

  # CloudWatch exporter (AWS)
  awscloudwatch:
    region: us-east-1
    namespace: MyApp
    log_group_name: /aws/otel/logs
    log_stream_name: otel-collector

  # Datadog exporter
  datadog:
    api:
      key: ${DD_API_KEY}
      site: datadoghq.com

  # File exporter (debugging)
  file:
    path: /tmp/otel-output.json

  # Logging exporter (console output for debugging)
  logging:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  # Health check endpoint
  health_check:
    endpoint: 0.0.0.0:13133

  # Pprof endpoint (for profiling)
  pprof:
    endpoint: 0.0.0.0:1777

  # ZPages (internal diagnostics)
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions: [health_check, pprof, zpages]

  pipelines:
    # Traces pipeline
    traces:
      receivers: [otlp, zipkin]
      processors: [memory_limiter, batch, tail_sampling, resource, span]
      exporters: [otlp/tempo, jaeger, logging]

    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, hostmetrics, k8s_cluster]
      processors: [memory_limiter, batch, filter, resource]
      exporters: [otlp/mimir, prometheus, awscloudwatch]

    # Logs pipeline
    logs:
      receivers: [otlp]
      processors: [memory_limiter, batch, resource, attributes]
      exporters: [loki, elasticsearch, awscloudwatch]

  # Telemetry (collector's own metrics)
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888

# Notes:
# 1. Replace ${DD_API_KEY} with actual API key or use environment variable
# 2. Adjust endpoints to match your infrastructure
# 3. Comment out exporters you don't use
# 4. Adjust sampling rates based on your volume and needs
# 5. Add TLS configuration for production deployments
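The notes above leave secret handling and TLS to the reader. As a minimal sketch of what notes 1 and 5 could look like in practice: the certificate paths are assumptions (adapt to wherever your certs are mounted), and `${env:DD_API_KEY}` uses the collector's environment-variable substitution instead of inlining the key. This is an illustration, not part of the committed template.

```yaml
exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: false                        # verify the backend's certificate
      ca_file: /etc/otel/certs/ca.crt        # assumed mount paths
      cert_file: /etc/otel/certs/client.crt
      key_file: /etc/otel/certs/client.key
  datadog:
    api:
      key: ${env:DD_API_KEY}                 # read the key from the environment
      site: datadoghq.com
```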
assets/templates/prometheus-alerts/kubernetes-alerts.yml (new file, 293 lines)
@@ -0,0 +1,293 @@
---
# Prometheus Alert Rules for Kubernetes
# Covers pods, nodes, deployments, and resource usage

groups:
  - name: kubernetes_pods
    interval: 30s
    rules:
      # Pod crash looping
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} has been restarting at a rate of {{ $value }} restarts/second over the last 15 minutes.

            Check pod logs:
            kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous
          runbook_url: "https://runbooks.example.com/pod-crash-loop"

      # Pod not ready
      - alert: PodNotReady
        expr: |
          sum by (namespace, pod, phase) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in {{ $labels.phase }} state for 10 minutes.

            Investigate:
            kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }}
          runbook_url: "https://runbooks.example.com/pod-not-ready"

      # Pod OOMKilled
      - alert: PodOOMKilled
        expr: |
          sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0
        for: 1m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory.

            Increase memory limits or investigate memory leak.
          runbook_url: "https://runbooks.example.com/oom-killed"

  - name: kubernetes_deployments
    interval: 30s
    rules:
      # Deployment replica mismatch
      - alert: DeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}"
          description: |
            Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with
            fewer replicas than desired for 15 minutes.

            Desired: {{ $value }}
            Available: Check deployment status
          runbook_url: "https://runbooks.example.com/replica-mismatch"

      # Deployment rollout stuck
      - alert: DeploymentRolloutStuck
        expr: |
          kube_deployment_status_condition{condition="Progressing", status="false"} > 0
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}"
          description: |
            Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck.

            Check rollout status:
            kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/rollout-stuck"

  - name: kubernetes_nodes
    interval: 30s
    rules:
      # Node not ready
      - alert: NodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 5m
        labels:
          severity: critical
          team: platform
          component: kubernetes
        annotations:
          summary: "Node not ready - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} has been NotReady for 5 minutes.

            This will affect pod scheduling and availability.

            Check node status:
            kubectl describe node {{ $labels.node }}
          runbook_url: "https://runbooks.example.com/node-not-ready"

      # Node memory pressure
      - alert: NodeMemoryPressure
        expr: |
          kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node under memory pressure - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} is experiencing memory pressure.

            Pods may be evicted. Consider scaling up or evicting low-priority pods.
          runbook_url: "https://runbooks.example.com/memory-pressure"

      # Node disk pressure
      - alert: NodeDiskPressure
        expr: |
          kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node under disk pressure - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} is experiencing disk pressure.

            Clean up disk space or add capacity.
          runbook_url: "https://runbooks.example.com/disk-pressure"

      # Node high CPU
      - alert: NodeHighCPU
        expr: |
          (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node high CPU usage - {{ $labels.instance }}"
          description: |
            Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%.

            Check for resource-intensive pods or scale cluster.
          runbook_url: "https://runbooks.example.com/node-high-cpu"

  - name: kubernetes_resources
    interval: 30s
    rules:
      # Container CPU throttling
      - alert: ContainerCPUThrottling
        expr: |
          rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
            is being CPU throttled.

            CPU throttling rate: {{ $value | humanize }}

            Consider increasing CPU limits.
          runbook_url: "https://runbooks.example.com/cpu-throttling"

      # Container memory usage high
      - alert: ContainerMemoryUsageHigh
        expr: |
          (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.9
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
            is using {{ $value | humanizePercentage }} of its memory limit.

            Risk of OOMKill. Consider increasing memory limits.
          runbook_url: "https://runbooks.example.com/high-memory"

  - name: kubernetes_pv
    interval: 30s
    rules:
      # PersistentVolume nearing full
      - alert: PersistentVolumeFillingUp
        expr: |
          (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
          description: |
            PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
            has only {{ $value | humanizePercentage }} of its capacity available.

            Available space is running low. Consider expanding volume.
          runbook_url: "https://runbooks.example.com/pv-filling-up"

      # PersistentVolume critically full
      - alert: PersistentVolumeCriticallyFull
        expr: |
          (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
          component: kubernetes
        annotations:
          summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
          description: |
            PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
            has only {{ $value | humanizePercentage }} of its capacity available.

            Immediate action required to prevent application failures.
          runbook_url: "https://runbooks.example.com/pv-critically-full"

  - name: kubernetes_jobs
    interval: 30s
    rules:
      # Job failed
      - alert: JobFailed
        expr: |
          kube_job_status_failed > 0
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}"
          description: |
            Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed.

            Check job logs:
            kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/job-failed"

      # CronJob not running
      - alert: CronJobNotRunning
        expr: |
          time() - kube_cronjob_status_last_schedule_time > 3600
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}"
          description: |
            CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} hasn't run in over an hour.

            Check CronJob status:
            kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/cronjob-not-running"
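These rules only take effect once Prometheus loads the file. A minimal sketch of the wiring, assuming the rule files are mounted under /etc/prometheus/rules (the path and a unified mount are assumptions, not part of this commit):

```yaml
# prometheus.yml (excerpt)
rule_files:
  - /etc/prometheus/rules/kubernetes-alerts.yml
  - /etc/prometheus/rules/webapp-alerts.yml
```

Rule files can be checked before reloading Prometheus with `promtool check rules <file>`.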
assets/templates/prometheus-alerts/webapp-alerts.yml (new file, 243 lines)
@@ -0,0 +1,243 @@
---
# Prometheus Alert Rules for Web Applications
# Based on SLO best practices and multi-window burn rate alerting

groups:
  - name: webapp_availability
    interval: 30s
    rules:
      # Fast burn rate alert (1h window) - SLO: 99.9%
      - alert: ErrorBudgetFastBurn
        expr: |
          (
            sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
            /
            sum(rate(http_requests_total{job="webapp"}[1h]))
          ) > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Fast error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last hour,
            burning through error budget at 14.4x rate.

            At this rate, the monthly error budget will be exhausted in 2 days.

            Immediate investigation required.
          runbook_url: "https://runbooks.example.com/error-budget-burn"
          dashboard: "https://grafana.example.com/d/webapp"

      # Slow burn rate alert (6h window)
      - alert: ErrorBudgetSlowBurn
        expr: |
          (
            sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h]))
            /
            sum(rate(http_requests_total{job="webapp"}[6h]))
          ) > (6 * 0.001)
        for: 30m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Elevated error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
            burning through error budget at 6x rate.

            Monitor closely and investigate if trend continues.
          runbook_url: "https://runbooks.example.com/error-budget-burn"

      # Service down alert
      - alert: WebAppDown
        expr: up{job="webapp"} == 0
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Web application is down - {{ $labels.instance }}"
          description: |
            Web application instance {{ $labels.instance }} has been down for 2 minutes.

            Check service health and logs immediately.
          runbook_url: "https://runbooks.example.com/service-down"

  - name: webapp_latency
    interval: 30s
    rules:
      # High latency (p95)
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
          ) > 0.5
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High p95 latency - {{ $labels.job }}"
          description: |
            P95 request latency is {{ $value }}s, exceeding 500ms threshold.

            This may impact user experience. Check for:
            - Slow database queries
            - External API issues
            - Resource saturation
          runbook_url: "https://runbooks.example.com/high-latency"
          dashboard: "https://grafana.example.com/d/webapp-latency"

      # Very high latency (p99)
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Critical latency degradation - {{ $labels.job }}"
          description: |
            P99 request latency is {{ $value }}s, exceeding 2s threshold.

            Severe performance degradation detected.
          runbook_url: "https://runbooks.example.com/high-latency"

  - name: webapp_resources
    interval: 30s
    rules:
      # High CPU
      - alert: HighCPU
        expr: |
          rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High CPU usage - {{ $labels.instance }}"
          description: |
            CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Consider scaling up or investigating CPU-intensive operations.
          runbook_url: "https://runbooks.example.com/high-cpu"

      # High memory
      - alert: HighMemory
        expr: |
          (process_resident_memory_bytes{job="webapp"} / node_memory_MemTotal_bytes) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High memory usage - {{ $labels.instance }}"
          description: |
            Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Check for memory leaks or consider scaling up.
          runbook_url: "https://runbooks.example.com/high-memory"

  - name: webapp_traffic
    interval: 30s
    rules:
      # Traffic spike
      - alert: TrafficSpike
        expr: |
          sum(rate(http_requests_total{job="webapp"}[5m]))
          >
          1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic spike detected - {{ $labels.job }}"
          description: |
            Request rate increased by 50% compared to 1 hour ago.

            Current: {{ $value | humanize }} req/s

            This could be:
            - Legitimate traffic increase
            - DDoS attack
            - Retry storm

            Monitor closely and be ready to scale.
          runbook_url: "https://runbooks.example.com/traffic-spike"

      # Traffic drop (potential issue)
      - alert: TrafficDrop
        expr: |
          sum(rate(http_requests_total{job="webapp"}[5m]))
          <
          0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic drop detected - {{ $labels.job }}"
          description: |
            Request rate dropped by 50% compared to 1 hour ago.

            This could indicate:
            - Upstream service issue
            - DNS problems
            - Load balancer misconfiguration
          runbook_url: "https://runbooks.example.com/traffic-drop"

  - name: webapp_dependencies
    interval: 30s
    rules:
      # Database connection pool exhaustion
      - alert: DatabasePoolExhausted
        expr: |
          (db_connection_pool_active / db_connection_pool_max) > 0.9
        for: 5m
        labels:
          severity: critical
          team: backend
          component: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: |
            Connection pool is {{ $value | humanizePercentage }} full.

            This will cause request failures. Immediate action required.
          runbook_url: "https://runbooks.example.com/db-pool-exhausted"

      # External API errors
      - alert: ExternalAPIErrors
        expr: |
          sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api)
          /
          sum(rate(external_api_requests_total[5m])) by (api)
          > 0.1
        for: 5m
        labels:
          severity: warning
          team: backend
          component: integration
        annotations:
          summary: "High error rate from external API - {{ $labels.api }}"
          description: |
            {{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate.

            Check API status page and consider enabling circuit breaker.
          runbook_url: "https://runbooks.example.com/external-api-errors"
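The 14.4 factor in ErrorBudgetFastBurn is the standard burn-rate arithmetic: with a 99.9% SLO the budget is 0.1%, and a burn rate of 14.4 consumes 14.4 / 720 = 2% of a 30-day (720-hour) budget per hour, so the whole budget would be gone in 720 / 14.4 = 50 hours, roughly the 2 days quoted in the annotation. The file's header mentions multi-window burn rate alerting, but each rule above uses a single window. A sketch of a two-window variant of the fast-burn alert (the short 5m window lets the alert clear quickly once errors stop) is shown below; it is an illustration, not part of the committed file:

```yaml
- alert: ErrorBudgetFastBurnMultiWindow
  expr: |
    (
      sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
      /
      sum(rate(http_requests_total{job="webapp"}[1h]))
    ) > (14.4 * 0.001)
    and
    (
      sum(rate(http_requests_total{job="webapp",status=~"5.."}[5m]))
      /
      sum(rate(http_requests_total{job="webapp"}[5m]))
    ) > (14.4 * 0.001)
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Fast error budget burn (1h and 5m windows both above threshold)"
```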
assets/templates/runbooks/incident-runbook-template.md (new file, 409 lines)
@@ -0,0 +1,409 @@
# Runbook: [Alert Name]

## Overview

**Alert Name**: [e.g., HighLatency, ServiceDown, ErrorBudgetBurn]

**Severity**: [Critical | Warning | Info]

**Team**: [e.g., Backend, Platform, Database]

**Component**: [e.g., API Gateway, User Service, PostgreSQL]

**What it means**: [One-line description of what this alert indicates]

**User impact**: [How does this affect users? High/Medium/Low]

**Urgency**: [How quickly must this be addressed? Immediate/Hours/Days]

---

## Alert Details

### When This Alert Fires

This alert fires when:
- [Specific condition, e.g., "P95 latency exceeds 500ms for 10 minutes"]
- [Any additional conditions]

### Symptoms

Users will experience:
- [ ] Slow response times
- [ ] Errors or failures
- [ ] Service unavailable
- [ ] [Other symptoms]

### Probable Causes

Common causes include:
1. **[Cause 1]**: [Description]
   - Example: Database overload due to slow queries
2. **[Cause 2]**: [Description]
   - Example: Memory leak causing OOM errors
3. **[Cause 3]**: [Description]
   - Example: Upstream service degradation

---

## Investigation Steps

### 1. Check Service Health

**Dashboard**: [Link to primary dashboard]

**Key metrics to check**:
```promql
# Request rate
sum(rate(http_requests_total[5m]))

# Error rate
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))

# Latency (p95, p99)
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
```

**What to look for**:
- [ ] Has traffic spiked recently?
- [ ] Is error rate elevated?
- [ ] Are any endpoints particularly slow?

### 2. Check Recent Changes

**Deployments**:
```bash
# Kubernetes
kubectl rollout history deployment/[service-name] -n [namespace]

# Check when last deployed
kubectl get pods -n [namespace] -o wide | grep [service-name]
```

**What to look for**:
- [ ] Was there a recent deployment?
- [ ] Did alert start after deployment?
- [ ] Any configuration changes?

### 3. Check Logs

**Log query** (adjust for your log system):
```bash
# Kubernetes
kubectl logs deployment/[service-name] -n [namespace] --tail=100 | grep ERROR

# Elasticsearch/Kibana
GET /logs-*/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "service": "[service-name]" } },
        { "match": { "level": "error" } },
        { "range": { "@timestamp": { "gte": "now-30m" } } }
      ]
    }
  }
}

# Loki/LogQL
{job="[service-name]"} |= "error" | json | level="error"
```

**What to look for**:
- [ ] Repeated error messages
- [ ] Stack traces
- [ ] Connection errors
- [ ] Timeout errors

### 4. Check Dependencies

**Database**:
```sql
-- Check active connections
SELECT count(*) FROM pg_stat_activity WHERE state = 'active';

-- Check slow queries
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND now() - pg_stat_activity.query_start > interval '5 seconds';
```

**External APIs**:
- [ ] Check status pages: [Link to status pages]
- [ ] Check API error rates in dashboard
- [ ] Test API endpoints manually

**Cache** (Redis/Memcached):
```bash
# Redis info
redis-cli -h [host] INFO stats

# Check memory usage
redis-cli -h [host] INFO memory
```

### 5. Check Resource Usage

**CPU and Memory**:
```bash
# Kubernetes
kubectl top pods -n [namespace] | grep [service-name]

# Node metrics
kubectl top nodes
```

**Prometheus queries**:
```promql
# CPU usage by pod
sum(rate(container_cpu_usage_seconds_total{pod=~"[service-name].*"}[5m])) by (pod)

# Memory usage by pod
sum(container_memory_usage_bytes{pod=~"[service-name].*"}) by (pod)
```

**What to look for**:
- [ ] CPU throttling
- [ ] Memory approaching limits
- [ ] Disk space issues

### 6. Check Traces (if available)

**Trace query**:
```bash
# Jaeger
# Search for slow traces (> 1s) in last 30 minutes

# Tempo/TraceQL
{ duration > 1s && resource.service.name = "[service-name]" }
```

**What to look for**:
- [ ] Which operation is slow?
- [ ] Where is time spent? (DB, external API, service logic)
- [ ] Any N+1 query patterns?

---

## Common Scenarios and Solutions

### Scenario 1: Recent Deployment Caused Issue

**Symptoms**:
- Alert started immediately after deployment
- Error logs correlate with new code

**Solution**:
```bash
# Rollback deployment
kubectl rollout undo deployment/[service-name] -n [namespace]

# Verify rollback succeeded
kubectl rollout status deployment/[service-name] -n [namespace]

# Monitor for alert resolution
```

**Follow-up**:
- [ ] Create incident report
- [ ] Review deployment process
- [ ] Add pre-deployment checks

### Scenario 2: Database Performance Issue

**Symptoms**:
- Slow query logs show problematic queries
- Database CPU or connection pool exhausted

**Solution**:
```sql
-- Identify slow query
-- Kill long-running query (use with caution)
SELECT pg_cancel_backend([pid]);

-- Or terminate if cancel doesn't work
SELECT pg_terminate_backend([pid]);

-- Add index if missing (in maintenance window)
CREATE INDEX CONCURRENTLY idx_name ON table_name (column_name);
```

**Follow-up**:
- [ ] Add query performance test
- [ ] Review and optimize query
- [ ] Consider read replicas

### Scenario 3: Memory Leak

**Symptoms**:
- Memory usage gradually increasing
- Eventually OOMKilled
- Restarts temporarily fix issue

**Solution**:
```bash
# Immediate: Restart pods
kubectl rollout restart deployment/[service-name] -n [namespace]

# Increase memory limits (temporary)
kubectl set resources deployment/[service-name] -n [namespace] \
  --limits=memory=2Gi
```

**Follow-up**:
- [ ] Profile application for memory leaks
- [ ] Add memory usage alerts
- [ ] Fix root cause

### Scenario 4: Traffic Spike / DDoS

**Symptoms**:
- Sudden traffic increase
- Traffic from unusual sources
- High CPU/memory across all instances

**Solution**:
```bash
# Scale up immediately
kubectl scale deployment/[service-name] -n [namespace] --replicas=10

# Enable rate limiting at load balancer level
# (Specific steps depend on LB)

# Block suspicious IPs if confirmed DDoS
# (Use WAF or network policies)
```

**Follow-up**:
- [ ] Implement rate limiting
- [ ] Add DDoS protection (CloudFlare, WAF)
- [ ] Set up auto-scaling

### Scenario 5: Upstream Service Degradation

**Symptoms**:
- Errors calling external API
- Timeouts to upstream service
- Upstream status page shows issues

**Solution**:
```bash
# Enable circuit breaker (if available)
# Adjust timeout configuration
# Switch to backup service/cached data

# Monitor external service
# Check status page: [Link]
```

**Follow-up**:
- [ ] Implement circuit breaker pattern
- [ ] Add fallback mechanisms
- [ ] Set up external service monitoring

---

## Immediate Actions (< 5 minutes)

These should be done first to mitigate impact:

1. **[Action 1]**: [e.g., "Scale up service"]
   ```bash
   kubectl scale deployment/[service] --replicas=10
   ```

2. **[Action 2]**: [e.g., "Rollback deployment"]
   ```bash
   kubectl rollout undo deployment/[service]
   ```

3. **[Action 3]**: [e.g., "Enable circuit breaker"]

---

## Short-term Actions (< 30 minutes)

After immediate mitigation:

1. **[Action 1]**: [e.g., "Investigate root cause"]
2. **[Action 2]**: [e.g., "Optimize slow query"]
3. **[Action 3]**: [e.g., "Clear cache if stale"]

---

## Long-term Actions (Post-Incident)

Preventive measures:

1. **[Action 1]**: [e.g., "Add circuit breaker"]
2. **[Action 2]**: [e.g., "Implement auto-scaling"]
3. **[Action 3]**: [e.g., "Add query performance tests"]
4. **[Action 4]**: [e.g., "Update alert thresholds"]

---

## Escalation

If issue persists after 30 minutes:

**Escalation Path**:
1. **Primary oncall**: @[username] ([slack/email])
2. **Team lead**: @[username] ([slack/email])
3. **Engineering manager**: @[username] ([slack/email])
4. **Incident commander**: @[username] ([slack/email])

**Communication**:
- **Slack channel**: #[incidents-channel]
- **Status page**: [Link]
- **Incident tracking**: [Link to incident management tool]

---

## Related Runbooks

- [Related Runbook 1]
- [Related Runbook 2]
- [Related Runbook 3]

## Related Dashboards

- [Main Service Dashboard]
- [Resource Usage Dashboard]
- [Dependency Dashboard]

## Related Documentation

- [Architecture Diagram]
- [Service Documentation]
- [API Documentation]

---

## Recent Incidents

| Date | Duration | Root Cause | Resolution | Ticket |
|------|----------|------------|------------|--------|
| 2024-10-15 | 23 min | Database pool exhausted | Increased pool size | INC-123 |
| 2024-09-30 | 45 min | Memory leak | Fixed code, restarted | INC-120 |

---

## Runbook Metadata

**Last Updated**: [Date]

**Owner**: [Team name]

**Reviewers**: [Names]

**Next Review**: [Date]

---

## Notes

- This runbook should be reviewed quarterly
- Update after each incident to capture new learnings
- Keep investigation steps concise and actionable
- Include actual commands that can be copy-pasted
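The alert rules in this commit attach `severity` and `team` labels, and the runbook's escalation section assumes someone actually gets paged. A minimal Alertmanager routing sketch that maps those labels to receivers is shown below; the receiver names, Slack channel, and PagerDuty key are placeholders, not part of this commit:

```yaml
# alertmanager.yml (excerpt)
route:
  receiver: slack-platform             # default receiver for warnings
  group_by: [alertname, namespace]
  routes:
    - matchers: ['severity="critical"']
      receiver: pagerduty-oncall       # critical alerts page the on-call

receivers:
  - name: slack-platform
    slack_configs:
      - channel: '#incidents-channel'
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: '<pagerduty-integration-key>'
```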