Initial commit

Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions


@@ -0,0 +1,293 @@
---
# Prometheus Alert Rules for Kubernetes
# Covers pods, nodes, deployments, and resource usage
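#
# Loading and validation (a minimal sketch; the file name "kubernetes-alerts.yml"
# and its location next to prometheus.yml are assumptions, not part of this file):
#
#   # prometheus.yml
#   rule_files:
#     - "kubernetes-alerts.yml"
#
# Rule syntax can be checked before reloading Prometheus with:
#   promtool check rules kubernetes-alerts.yml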
groups:
- name: kubernetes_pods
interval: 30s
rules:
# Pod crash looping
- alert: PodCrashLooping
expr: |
rate(kube_pod_container_status_restarts_total[15m]) > 0
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been restarting over the last 15 minutes (restart rate: {{ $value }} restarts/second).
Check pod logs:
kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous
runbook_url: "https://runbooks.example.com/pod-crash-loop"
# Pod not ready
- alert: PodNotReady
expr: |
sum by (namespace, pod, phase) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in the {{ $labels.phase }} phase for 10 minutes.
Investigate:
kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }}
runbook_url: "https://runbooks.example.com/pod-not-ready"
# Pod OOMKilled
- alert: PodOOMKilled
expr: |
sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0
for: 1m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory.
Increase memory limits or investigate memory leak.
runbook_url: "https://runbooks.example.com/oom-killed"
- name: kubernetes_deployments
interval: 30s
rules:
# Deployment replica mismatch
- alert: DeploymentReplicasMismatch
expr: |
kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}"
description: |
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with
fewer replicas than desired for 15 minutes.
Desired replicas: {{ $value }}
Check current status: kubectl get deployment {{ $labels.deployment }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/replica-mismatch"
# Deployment rollout stuck
- alert: DeploymentRolloutStuck
expr: |
kube_deployment_status_condition{condition="Progressing", status="false"} > 0
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}"
description: |
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck.
Check rollout status:
kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/rollout-stuck"
- name: kubernetes_nodes
interval: 30s
rules:
# Node not ready
- alert: NodeNotReady
expr: |
kube_node_status_condition{condition="Ready",status="true"} == 0
for: 5m
labels:
severity: critical
team: platform
component: kubernetes
annotations:
summary: "Node not ready - {{ $labels.node }}"
description: |
Node {{ $labels.node }} has been NotReady for 5 minutes.
This will affect pod scheduling and availability.
Check node status:
kubectl describe node {{ $labels.node }}
runbook_url: "https://runbooks.example.com/node-not-ready"
# Node memory pressure
- alert: NodeMemoryPressure
expr: |
kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node under memory pressure - {{ $labels.node }}"
description: |
Node {{ $labels.node }} is experiencing memory pressure.
Pods may be evicted. Consider scaling up or evicting low-priority pods.
runbook_url: "https://runbooks.example.com/memory-pressure"
# Node disk pressure
- alert: NodeDiskPressure
expr: |
kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node under disk pressure - {{ $labels.node }}"
description: |
Node {{ $labels.node }} is experiencing disk pressure.
Clean up disk space or add capacity.
runbook_url: "https://runbooks.example.com/disk-pressure"
# Node high CPU
- alert: NodeHighCPU
expr: |
(1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node high CPU usage - {{ $labels.instance }}"
description: |
Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%.
Check for resource-intensive pods or scale cluster.
runbook_url: "https://runbooks.example.com/node-high-cpu"
- name: kubernetes_resources
interval: 30s
rules:
# Container CPU throttling
- alert: ContainerCPUThrottling
expr: |
rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
is being CPU throttled.
CPU throttling rate: {{ $value | humanize }}
Consider increasing CPU limits.
runbook_url: "https://runbooks.example.com/cpu-throttling"
# Container memory usage high
- alert: ContainerMemoryUsageHigh
expr: |
(container_memory_usage_bytes{container!=""} / (container_spec_memory_limit_bytes{container!=""} > 0)) > 0.9
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
is using {{ $value | humanizePercentage }} of its memory limit.
Risk of OOMKill. Consider increasing memory limits.
runbook_url: "https://runbooks.example.com/high-memory"
- name: kubernetes_pv
interval: 30s
rules:
# PersistentVolume nearing full
- alert: PersistentVolumeFillingUp
expr: |
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
description: |
PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
has only {{ $value | humanizePercentage }} of its capacity remaining.
Available space is running low. Consider expanding the volume.
runbook_url: "https://runbooks.example.com/pv-filling-up"
# PersistentVolume critically full
- alert: PersistentVolumeCriticallyFull
expr: |
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05
for: 5m
labels:
severity: critical
team: platform
component: kubernetes
annotations:
summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
description: |
PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
has only {{ $value | humanizePercentage }} of its capacity remaining.
Immediate action required to prevent application failures.
runbook_url: "https://runbooks.example.com/pv-critically-full"
- name: kubernetes_jobs
interval: 30s
rules:
# Job failed
- alert: JobFailed
expr: |
kube_job_status_failed > 0
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}"
description: |
Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed.
Check job logs:
kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/job-failed"
# CronJob not running
- alert: CronJobNotRunning
expr: |
time() - kube_cronjob_status_last_schedule_time > 3600
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}"
description: |
CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} hasn't run in over an hour.
Check CronJob status:
kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/cronjob-not-running"


@@ -0,0 +1,243 @@
---
# Prometheus Alert Rules for Web Applications
# Based on SLO best practices and multi-window burn rate alerting
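#
# Why 14.4 and 6 (a worked derivation of the thresholds used below):
#   SLO 99.9%  =>  error budget = 0.1% of requests = 0.001
#   A burn rate of N consumes N * window / 30d of the monthly budget:
#     14.4 * 1h / 720h = 2% of the budget per hour      (budget exhausted in 720h / 14.4 ≈ 50h ≈ 2 days)
#     6    * 6h / 720h = 5% of the budget per 6h window (budget exhausted in 720h / 6 = 120h = 5 days)
#   Hence the (14.4 * 0.001) and (6 * 0.001) thresholds in the burn-rate rules below.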
groups:
- name: webapp_availability
interval: 30s
rules:
# Fast burn rate alert (1h window) - SLO: 99.9%
- alert: ErrorBudgetFastBurn
expr: |
(
sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
/
sum(rate(http_requests_total{job="webapp"}[1h]))
) > (14.4 * 0.001)
for: 2m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Fast error budget burn - {{ $labels.job }}"
description: |
Error rate is {{ $value | humanizePercentage }} over the last hour,
burning through error budget at 14.4x rate.
At this rate, the monthly error budget will be exhausted in 2 days.
Immediate investigation required.
runbook_url: "https://runbooks.example.com/error-budget-burn"
dashboard: "https://grafana.example.com/d/webapp"
# Slow burn rate alert (6h window)
- alert: ErrorBudgetSlowBurn
expr: |
(
sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h]))
/
sum(rate(http_requests_total{job="webapp"}[6h]))
) > (6 * 0.001)
for: 30m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Elevated error budget burn - {{ $labels.job }}"
description: |
Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
burning through error budget at 6x rate.
Monitor closely and investigate if trend continues.
runbook_url: "https://runbooks.example.com/error-budget-burn"
# Service down alert
- alert: WebAppDown
expr: up{job="webapp"} == 0
for: 2m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Web application is down - {{ $labels.instance }}"
description: |
Web application instance {{ $labels.instance }} has been down for 2 minutes.
Check service health and logs immediately.
runbook_url: "https://runbooks.example.com/service-down"
- name: webapp_latency
interval: 30s
rules:
# High latency (p95)
- alert: HighLatencyP95
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
) > 0.5
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High p95 latency - {{ $labels.job }}"
description: |
P95 request latency is {{ $value }}s, exceeding the 500ms threshold.
This may impact user experience. Check for:
- Slow database queries
- External API issues
- Resource saturation
runbook_url: "https://runbooks.example.com/high-latency"
dashboard: "https://grafana.example.com/d/webapp-latency"
# Very high latency (p99)
- alert: HighLatencyP99
expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
) > 2
for: 5m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Critical latency degradation - {{ $labels.job }}"
description: |
P99 request latency is {{ $value }}s, exceeding the 2s threshold.
Severe performance degradation detected.
runbook_url: "https://runbooks.example.com/high-latency"
- name: webapp_resources
interval: 30s
rules:
# High CPU
- alert: HighCPU
expr: |
rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80
for: 15m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High CPU usage - {{ $labels.instance }}"
description: |
CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}.
Consider scaling up or investigating CPU-intensive operations.
runbook_url: "https://runbooks.example.com/high-cpu"
# High memory
- alert: HighMemory
# NOTE: assumes the webapp and the node exporter share the same "instance" label;
# adjust the vector matching if they differ in your setup.
expr: |
(process_resident_memory_bytes{job="webapp"} / on(instance) node_memory_MemTotal_bytes) * 100 > 80
for: 15m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High memory usage - {{ $labels.instance }}"
description: |
Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}.
Check for memory leaks or consider scaling up.
runbook_url: "https://runbooks.example.com/high-memory"
- name: webapp_traffic
interval: 30s
rules:
# Traffic spike
- alert: TrafficSpike
expr: |
sum(rate(http_requests_total{job="webapp"}[5m]))
>
1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Traffic spike detected - {{ $labels.job }}"
description: |
Request rate has increased by more than 50% compared to 1 hour ago.
Current: {{ $value | humanize }} req/s
This could be:
- Legitimate traffic increase
- DDoS attack
- Retry storm
Monitor closely and be ready to scale.
runbook_url: "https://runbooks.example.com/traffic-spike"
# Traffic drop (potential issue)
- alert: TrafficDrop
expr: |
sum(rate(http_requests_total{job="webapp"}[5m]))
<
0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Traffic drop detected - {{ $labels.job }}"
description: |
Request rate has dropped by more than 50% compared to 1 hour ago.
This could indicate:
- Upstream service issue
- DNS problems
- Load balancer misconfiguration
runbook_url: "https://runbooks.example.com/traffic-drop"
- name: webapp_dependencies
interval: 30s
rules:
# Database connection pool exhaustion
- alert: DatabasePoolExhausted
expr: |
(db_connection_pool_active / db_connection_pool_max) > 0.9
for: 5m
labels:
severity: critical
team: backend
component: database
annotations:
summary: "Database connection pool near exhaustion"
description: |
Connection pool is {{ $value | humanizePercentage }} full.
This will cause request failures. Immediate action required.
runbook_url: "https://runbooks.example.com/db-pool-exhausted"
# External API errors
- alert: ExternalAPIErrors
expr: |
sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api)
/
sum(rate(external_api_requests_total[5m])) by (api)
> 0.1
for: 5m
labels:
severity: warning
team: backend
component: integration
annotations:
summary: "High error rate from external API - {{ $labels.api }}"
description: |
{{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate.
Check API status page and consider enabling circuit breaker.
runbook_url: "https://runbooks.example.com/external-api-errors"