Initial commit

Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions


@@ -0,0 +1,293 @@
---
# Prometheus Alert Rules for Kubernetes
# Covers pods, nodes, deployments, and resource usage
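#
# Loading and validation (a minimal sketch; the file name "kubernetes-alerts.yml"
# and its location next to prometheus.yml are assumptions, not part of this file):
#
#   # prometheus.yml
#   rule_files:
#     - "kubernetes-alerts.yml"
#
# Rule syntax can be checked before reloading Prometheus with:
#   promtool check rules kubernetes-alerts.yml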
groups:
- name: kubernetes_pods
interval: 30s
rules:
# Pod crash looping
- alert: PodCrashLooping
expr: |
rate(kube_pod_container_status_restarts_total[15m]) > 0
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been restarting over the last 15 minutes (restart rate: {{ $value }} restarts/second).
Check pod logs:
kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous
runbook_url: "https://runbooks.example.com/pod-crash-loop"
# Pod not ready
- alert: PodNotReady
expr: |
sum by (namespace, pod, phase) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in the {{ $labels.phase }} phase for 10 minutes.
Investigate:
kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }}
runbook_url: "https://runbooks.example.com/pod-not-ready"
# Pod OOMKilled
- alert: PodOOMKilled
expr: |
sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0
for: 1m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory.
Increase memory limits or investigate memory leak.
runbook_url: "https://runbooks.example.com/oom-killed"
- name: kubernetes_deployments
interval: 30s
rules:
# Deployment replica mismatch
- alert: DeploymentReplicasMismatch
expr: |
kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}"
description: |
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with
fewer replicas than desired for 15 minutes.
Desired replicas: {{ $value }}
Check current status: kubectl get deployment {{ $labels.deployment }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/replica-mismatch"
# Deployment rollout stuck
- alert: DeploymentRolloutStuck
expr: |
kube_deployment_status_condition{condition="Progressing", status="false"} > 0
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}"
description: |
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck.
Check rollout status:
kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/rollout-stuck"
- name: kubernetes_nodes
interval: 30s
rules:
# Node not ready
- alert: NodeNotReady
expr: |
kube_node_status_condition{condition="Ready",status="true"} == 0
for: 5m
labels:
severity: critical
team: platform
component: kubernetes
annotations:
summary: "Node not ready - {{ $labels.node }}"
description: |
Node {{ $labels.node }} has been NotReady for 5 minutes.
This will affect pod scheduling and availability.
Check node status:
kubectl describe node {{ $labels.node }}
runbook_url: "https://runbooks.example.com/node-not-ready"
# Node memory pressure
- alert: NodeMemoryPressure
expr: |
kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node under memory pressure - {{ $labels.node }}"
description: |
Node {{ $labels.node }} is experiencing memory pressure.
Pods may be evicted. Consider scaling up or evicting low-priority pods.
runbook_url: "https://runbooks.example.com/memory-pressure"
# Node disk pressure
- alert: NodeDiskPressure
expr: |
kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node under disk pressure - {{ $labels.node }}"
description: |
Node {{ $labels.node }} is experiencing disk pressure.
Clean up disk space or add capacity.
runbook_url: "https://runbooks.example.com/disk-pressure"
# Node high CPU
- alert: NodeHighCPU
expr: |
(1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80
for: 15m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Node high CPU usage - {{ $labels.instance }}"
description: |
Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%.
Check for resource-intensive pods or scale cluster.
runbook_url: "https://runbooks.example.com/node-high-cpu"
- name: kubernetes_resources
interval: 30s
rules:
# Container CPU throttling
- alert: ContainerCPUThrottling
expr: |
rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
is being CPU throttled.
CPU throttling rate: {{ $value | humanize }}
Consider increasing CPU limits.
runbook_url: "https://runbooks.example.com/cpu-throttling"
# Container memory usage high
- alert: ContainerMemoryUsageHigh
expr: |
(container_memory_usage_bytes{container!=""} / (container_spec_memory_limit_bytes{container!=""} > 0)) > 0.9
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}"
description: |
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
is using {{ $value | humanizePercentage }} of its memory limit.
Risk of OOMKill. Consider increasing memory limits.
runbook_url: "https://runbooks.example.com/high-memory"
- name: kubernetes_pv
interval: 30s
rules:
# PersistentVolume nearing full
- alert: PersistentVolumeFillingUp
expr: |
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
description: |
PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
has only {{ $value | humanizePercentage }} of its capacity remaining.
Available space is running low. Consider expanding the volume.
runbook_url: "https://runbooks.example.com/pv-filling-up"
# PersistentVolume critically full
- alert: PersistentVolumeCriticallyFull
expr: |
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05
for: 5m
labels:
severity: critical
team: platform
component: kubernetes
annotations:
summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
description: |
PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
has only {{ $value | humanizePercentage }} of its capacity remaining.
Immediate action required to prevent application failures.
runbook_url: "https://runbooks.example.com/pv-critically-full"
- name: kubernetes_jobs
interval: 30s
rules:
# Job failed
- alert: JobFailed
expr: |
kube_job_status_failed > 0
for: 5m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}"
description: |
Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed.
Check job logs:
kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/job-failed"
# CronJob not running
- alert: CronJobNotRunning
expr: |
time() - kube_cronjob_status_last_schedule_time > 3600
for: 10m
labels:
severity: warning
team: platform
component: kubernetes
annotations:
summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}"
description: |
CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} hasn't run in over an hour.
Check CronJob status:
kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }}
runbook_url: "https://runbooks.example.com/cronjob-not-running"


@@ -0,0 +1,243 @@
---
# Prometheus Alert Rules for Web Applications
# Based on SLO best practices and multi-window burn rate alerting
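#
# Why 14.4 and 6 (a worked derivation of the thresholds used below):
#   SLO 99.9%  =>  error budget = 0.1% of requests = 0.001
#   A burn rate of N consumes N * window / 30d of the monthly budget:
#     14.4 * 1h / 720h = 2% of the budget per hour      (budget exhausted in 720h / 14.4 ≈ 50h ≈ 2 days)
#     6    * 6h / 720h = 5% of the budget per 6h window (budget exhausted in 720h / 6 = 120h = 5 days)
#   Hence the (14.4 * 0.001) and (6 * 0.001) thresholds in the burn-rate rules below.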
groups:
- name: webapp_availability
interval: 30s
rules:
# Fast burn rate alert (1h window) - SLO: 99.9%
- alert: ErrorBudgetFastBurn
expr: |
(
sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
/
sum(rate(http_requests_total{job="webapp"}[1h]))
) > (14.4 * 0.001)
for: 2m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Fast error budget burn - {{ $labels.job }}"
description: |
Error rate is {{ $value | humanizePercentage }} over the last hour,
burning through error budget at 14.4x rate.
At this rate, the monthly error budget will be exhausted in 2 days.
Immediate investigation required.
runbook_url: "https://runbooks.example.com/error-budget-burn"
dashboard: "https://grafana.example.com/d/webapp"
# Slow burn rate alert (6h window)
- alert: ErrorBudgetSlowBurn
expr: |
(
sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h]))
/
sum(rate(http_requests_total{job="webapp"}[6h]))
) > (6 * 0.001)
for: 30m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Elevated error budget burn - {{ $labels.job }}"
description: |
Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
burning through error budget at 6x rate.
Monitor closely and investigate if trend continues.
runbook_url: "https://runbooks.example.com/error-budget-burn"
# Service down alert
- alert: WebAppDown
expr: up{job="webapp"} == 0
for: 2m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Web application is down - {{ $labels.instance }}"
description: |
Web application instance {{ $labels.instance }} has been down for 2 minutes.
Check service health and logs immediately.
runbook_url: "https://runbooks.example.com/service-down"
- name: webapp_latency
interval: 30s
rules:
# High latency (p95)
- alert: HighLatencyP95
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
) > 0.5
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High p95 latency - {{ $labels.job }}"
description: |
P95 request latency is {{ $value }}s, exceeding the 500ms threshold.
This may impact user experience. Check for:
- Slow database queries
- External API issues
- Resource saturation
runbook_url: "https://runbooks.example.com/high-latency"
dashboard: "https://grafana.example.com/d/webapp-latency"
# Very high latency (p99)
- alert: HighLatencyP99
expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
) > 2
for: 5m
labels:
severity: critical
team: backend
component: webapp
annotations:
summary: "Critical latency degradation - {{ $labels.job }}"
description: |
P99 request latency is {{ $value }}s, exceeding the 2s threshold.
Severe performance degradation detected.
runbook_url: "https://runbooks.example.com/high-latency"
- name: webapp_resources
interval: 30s
rules:
# High CPU
- alert: HighCPU
expr: |
rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80
for: 15m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High CPU usage - {{ $labels.instance }}"
description: |
CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}.
Consider scaling up or investigating CPU-intensive operations.
runbook_url: "https://runbooks.example.com/high-cpu"
# High memory
- alert: HighMemory
# NOTE: assumes the webapp and the node exporter share the same "instance" label;
# adjust the vector matching if they differ in your setup.
expr: |
(process_resident_memory_bytes{job="webapp"} / on(instance) node_memory_MemTotal_bytes) * 100 > 80
for: 15m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "High memory usage - {{ $labels.instance }}"
description: |
Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}.
Check for memory leaks or consider scaling up.
runbook_url: "https://runbooks.example.com/high-memory"
- name: webapp_traffic
interval: 30s
rules:
# Traffic spike
- alert: TrafficSpike
expr: |
sum(rate(http_requests_total{job="webapp"}[5m]))
>
1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Traffic spike detected - {{ $labels.job }}"
description: |
Request rate has increased by more than 50% compared to 1 hour ago.
Current: {{ $value | humanize }} req/s
This could be:
- Legitimate traffic increase
- DDoS attack
- Retry storm
Monitor closely and be ready to scale.
runbook_url: "https://runbooks.example.com/traffic-spike"
# Traffic drop (potential issue)
- alert: TrafficDrop
expr: |
sum(rate(http_requests_total{job="webapp"}[5m]))
<
0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
for: 10m
labels:
severity: warning
team: backend
component: webapp
annotations:
summary: "Traffic drop detected - {{ $labels.job }}"
description: |
Request rate has dropped by more than 50% compared to 1 hour ago.
This could indicate:
- Upstream service issue
- DNS problems
- Load balancer misconfiguration
runbook_url: "https://runbooks.example.com/traffic-drop"
- name: webapp_dependencies
interval: 30s
rules:
# Database connection pool exhaustion
- alert: DatabasePoolExhausted
expr: |
(db_connection_pool_active / db_connection_pool_max) > 0.9
for: 5m
labels:
severity: critical
team: backend
component: database
annotations:
summary: "Database connection pool near exhaustion"
description: |
Connection pool is {{ $value | humanizePercentage }} full.
This will cause request failures. Immediate action required.
runbook_url: "https://runbooks.example.com/db-pool-exhausted"
# External API errors
- alert: ExternalAPIErrors
expr: |
sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api)
/
sum(rate(external_api_requests_total[5m])) by (api)
> 0.1
for: 5m
labels:
severity: warning
team: backend
component: integration
annotations:
summary: "High error rate from external API - {{ $labels.api }}"
description: |
{{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate.
Check API status page and consider enabling circuit breaker.
runbook_url: "https://runbooks.example.com/external-api-errors"