Initial commit
assets/templates/otel-config/collector-config.yaml (new file, 227 lines)
@@ -0,0 +1,227 @@
# OpenTelemetry Collector Configuration
# Receives metrics, logs, and traces and exports to various backends

receivers:
  # OTLP receiver (standard OpenTelemetry protocol)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Prometheus receiver (scrape Prometheus endpoints)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']

  # Host metrics (CPU, memory, disk, network)
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      network:
      filesystem:
      load:

  # Kubernetes receiver (cluster metrics)
  k8s_cluster:
    auth_type: serviceAccount
    node_conditions_to_report: [Ready, MemoryPressure, DiskPressure]
    distribution: kubernetes

  # Zipkin receiver (legacy tracing)
  zipkin:
    endpoint: 0.0.0.0:9411

processors:
  # Batch processor (improves performance)
  batch:
    timeout: 10s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Memory limiter (prevent OOM)
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Resource processor (add resource attributes)
  resource:
    attributes:
      - key: environment
        value: production
        action: insert
      - key: cluster.name
        value: prod-cluster
        action: insert

  # Attributes processor (modify span/metric attributes)
  attributes:
    actions:
      - key: http.url
        action: delete  # Remove potentially sensitive URLs
      - key: db.statement
        action: hash  # Hash SQL queries for privacy

  # Filter processor (drop unwanted data)
  filter:
    metrics:
      # Drop metrics matching criteria
      exclude:
        match_type: regexp
        metric_names:
          - ^go_.*       # Drop Go runtime metrics
          - ^process_.*  # Drop process metrics

  # Tail sampling (intelligent trace sampling)
  tail_sampling:
    decision_wait: 10s
    num_traces: 100
    policies:
      # Always sample errors
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]

      # Sample slow traces
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 1000

      # Sample 10% of others
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  # Span processor (modify spans)
  span:
    name:
      to_attributes:
        rules:
          - ^\/api\/v1\/users\/(?P<user_id>.*)$
      from_attributes:
        - db.name
        - http.method

exporters:
  # Prometheus exporter (expose metrics endpoint)
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel

  # OTLP exporters (send to backends)
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: true

  # Loki exporter (for logs)
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    labels:
      resource:
        service.name: "service_name"
        service.namespace: "service_namespace"
      attributes:
        level: "level"

  # Jaeger exporter (alternative tracing backend)
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  # Elasticsearch exporter (for logs)
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    logs_index: otel-logs
    traces_index: otel-traces

  # CloudWatch exporter (AWS)
  awscloudwatch:
    region: us-east-1
    namespace: MyApp
    log_group_name: /aws/otel/logs
    log_stream_name: otel-collector

  # Datadog exporter
  datadog:
    api:
      key: ${DD_API_KEY}
      site: datadoghq.com

  # File exporter (debugging)
  file:
    path: /tmp/otel-output.json

  # Logging exporter (console output for debugging)
  logging:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  # Health check endpoint
  health_check:
    endpoint: 0.0.0.0:13133

  # Pprof endpoint (for profiling)
  pprof:
    endpoint: 0.0.0.0:1777

  # ZPages (internal diagnostics)
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions: [health_check, pprof, zpages]

  pipelines:
    # Traces pipeline
    traces:
      receivers: [otlp, zipkin]
      processors: [memory_limiter, batch, tail_sampling, resource, span]
      exporters: [otlp/tempo, jaeger, logging]

    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, hostmetrics, k8s_cluster]
      processors: [memory_limiter, batch, filter, resource]
      exporters: [otlp/mimir, prometheus, awscloudwatch]

    # Logs pipeline
    logs:
      receivers: [otlp]
      processors: [memory_limiter, batch, resource, attributes]
      exporters: [loki, elasticsearch, awscloudwatch]

  # Telemetry (collector's own metrics)
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888

# Notes:
# 1. Replace ${DD_API_KEY} with actual API key or use environment variable
# 2. Adjust endpoints to match your infrastructure
# 3. Comment out exporters you don't use
# 4. Adjust sampling rates based on your volume and needs
# 5. Add TLS configuration for production deployments
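The notes above leave secret handling and TLS to the reader. As a minimal sketch of what notes 1 and 5 could look like in practice: the certificate paths are assumptions (adapt to wherever your certs are mounted), and `${env:DD_API_KEY}` uses the collector's environment-variable substitution instead of inlining the key. This is an illustration, not part of the committed template.

```yaml
exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: false                        # verify the backend's certificate
      ca_file: /etc/otel/certs/ca.crt        # assumed mount paths
      cert_file: /etc/otel/certs/client.crt
      key_file: /etc/otel/certs/client.key
  datadog:
    api:
      key: ${env:DD_API_KEY}                 # read the key from the environment
      site: datadoghq.com
```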
assets/templates/prometheus-alerts/kubernetes-alerts.yml (new file, 293 lines)
@@ -0,0 +1,293 @@
---
# Prometheus Alert Rules for Kubernetes
# Covers pods, nodes, deployments, and resource usage

groups:
  - name: kubernetes_pods
    interval: 30s
    rules:
      # Pod crash looping
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod is crash looping - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} has been restarting at a rate of {{ $value }} restarts/second over the last 15 minutes.

            Check pod logs:
            kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} --previous
          runbook_url: "https://runbooks.example.com/pod-crash-loop"

      # Pod not ready
      - alert: PodNotReady
        expr: |
          sum by (namespace, pod, phase) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod not ready - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in {{ $labels.phase }} state for 10 minutes.

            Investigate:
            kubectl describe pod -n {{ $labels.namespace }} {{ $labels.pod }}
          runbook_url: "https://runbooks.example.com/pod-not-ready"

      # Pod OOMKilled
      - alert: PodOOMKilled
        expr: |
          sum by (namespace, pod) (kube_pod_container_status_terminated_reason{reason="OOMKilled"}) > 0
        for: 1m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Pod killed due to OOM - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Pod {{ $labels.namespace }}/{{ $labels.pod }} was killed due to out-of-memory.

            Increase memory limits or investigate memory leak.
          runbook_url: "https://runbooks.example.com/oom-killed"

  - name: kubernetes_deployments
    interval: 30s
    rules:
      # Deployment replica mismatch
      - alert: DeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Deployment replicas mismatch - {{ $labels.namespace }}/{{ $labels.deployment }}"
          description: |
            Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has been running with
            fewer replicas than desired for 15 minutes.

            Desired: {{ $value }}
            Available: Check deployment status
          runbook_url: "https://runbooks.example.com/replica-mismatch"

      # Deployment rollout stuck
      - alert: DeploymentRolloutStuck
        expr: |
          kube_deployment_status_condition{condition="Progressing", status="false"} > 0
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Deployment rollout stuck - {{ $labels.namespace }}/{{ $labels.deployment }}"
          description: |
            Deployment {{ $labels.namespace }}/{{ $labels.deployment }} rollout is stuck.

            Check rollout status:
            kubectl rollout status deployment/{{ $labels.deployment }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/rollout-stuck"

  - name: kubernetes_nodes
    interval: 30s
    rules:
      # Node not ready
      - alert: NodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 5m
        labels:
          severity: critical
          team: platform
          component: kubernetes
        annotations:
          summary: "Node not ready - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} has been NotReady for 5 minutes.

            This will affect pod scheduling and availability.

            Check node status:
            kubectl describe node {{ $labels.node }}
          runbook_url: "https://runbooks.example.com/node-not-ready"

      # Node memory pressure
      - alert: NodeMemoryPressure
        expr: |
          kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node under memory pressure - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} is experiencing memory pressure.

            Pods may be evicted. Consider scaling up or evicting low-priority pods.
          runbook_url: "https://runbooks.example.com/memory-pressure"

      # Node disk pressure
      - alert: NodeDiskPressure
        expr: |
          kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node under disk pressure - {{ $labels.node }}"
          description: |
            Node {{ $labels.node }} is experiencing disk pressure.

            Clean up disk space or add capacity.
          runbook_url: "https://runbooks.example.com/disk-pressure"

      # Node high CPU
      - alert: NodeHighCPU
        expr: |
          (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Node high CPU usage - {{ $labels.instance }}"
          description: |
            Node {{ $labels.instance }} CPU usage is {{ $value | humanize }}%.

            Check for resource-intensive pods or scale cluster.
          runbook_url: "https://runbooks.example.com/node-high-cpu"

  - name: kubernetes_resources
    interval: 30s
    rules:
      # Container CPU throttling
      - alert: ContainerCPUThrottling
        expr: |
          rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Container CPU throttling - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
            is being CPU throttled.

            CPU throttling rate: {{ $value | humanize }}

            Consider increasing CPU limits.
          runbook_url: "https://runbooks.example.com/cpu-throttling"

      # Container memory usage high
      - alert: ContainerMemoryUsageHigh
        expr: |
          (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.9
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Container memory usage high - {{ $labels.namespace }}/{{ $labels.pod }}"
          description: |
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
            is using {{ $value | humanizePercentage }} of its memory limit.

            Risk of OOMKill. Consider increasing memory limits.
          runbook_url: "https://runbooks.example.com/high-memory"

  - name: kubernetes_pv
    interval: 30s
    rules:
      # PersistentVolume nearing full
      - alert: PersistentVolumeFillingUp
        expr: |
          (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.15
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "PersistentVolume filling up - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
          description: |
            PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
            has only {{ $value | humanizePercentage }} of its capacity available.

            Available space is running low. Consider expanding volume.
          runbook_url: "https://runbooks.example.com/pv-filling-up"

      # PersistentVolume critically full
      - alert: PersistentVolumeCriticallyFull
        expr: |
          (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
          component: kubernetes
        annotations:
          summary: "PersistentVolume critically full - {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}"
          description: |
            PersistentVolume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }}
            has only {{ $value | humanizePercentage }} of its capacity available.

            Immediate action required to prevent application failures.
          runbook_url: "https://runbooks.example.com/pv-critically-full"

  - name: kubernetes_jobs
    interval: 30s
    rules:
      # Job failed
      - alert: JobFailed
        expr: |
          kube_job_status_failed > 0
        for: 5m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "Job failed - {{ $labels.namespace }}/{{ $labels.job_name }}"
          description: |
            Job {{ $labels.namespace }}/{{ $labels.job_name }} has failed.

            Check job logs:
            kubectl logs job/{{ $labels.job_name }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/job-failed"

      # CronJob not running
      - alert: CronJobNotRunning
        expr: |
          time() - kube_cronjob_status_last_schedule_time > 3600
        for: 10m
        labels:
          severity: warning
          team: platform
          component: kubernetes
        annotations:
          summary: "CronJob not running - {{ $labels.namespace }}/{{ $labels.cronjob }}"
          description: |
            CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} hasn't run in over an hour.

            Check CronJob status:
            kubectl describe cronjob {{ $labels.cronjob }} -n {{ $labels.namespace }}
          runbook_url: "https://runbooks.example.com/cronjob-not-running"
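These rules only take effect once Prometheus loads the file. A minimal sketch of the wiring, assuming the rule files are mounted under /etc/prometheus/rules (the path and a unified mount are assumptions, not part of this commit):

```yaml
# prometheus.yml (excerpt)
rule_files:
  - /etc/prometheus/rules/kubernetes-alerts.yml
  - /etc/prometheus/rules/webapp-alerts.yml
```

Rule files can be checked before reloading Prometheus with `promtool check rules <file>`.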
assets/templates/prometheus-alerts/webapp-alerts.yml (new file, 243 lines)
@@ -0,0 +1,243 @@
---
# Prometheus Alert Rules for Web Applications
# Based on SLO best practices and multi-window burn rate alerting

groups:
  - name: webapp_availability
    interval: 30s
    rules:
      # Fast burn rate alert (1h window) - SLO: 99.9%
      - alert: ErrorBudgetFastBurn
        expr: |
          (
            sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
            /
            sum(rate(http_requests_total{job="webapp"}[1h]))
          ) > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Fast error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last hour,
            burning through error budget at 14.4x rate.

            At this rate, the monthly error budget will be exhausted in 2 days.

            Immediate investigation required.
          runbook_url: "https://runbooks.example.com/error-budget-burn"
          dashboard: "https://grafana.example.com/d/webapp"

      # Slow burn rate alert (6h window)
      - alert: ErrorBudgetSlowBurn
        expr: |
          (
            sum(rate(http_requests_total{job="webapp",status=~"5.."}[6h]))
            /
            sum(rate(http_requests_total{job="webapp"}[6h]))
          ) > (6 * 0.001)
        for: 30m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Elevated error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
            burning through error budget at 6x rate.

            Monitor closely and investigate if trend continues.
          runbook_url: "https://runbooks.example.com/error-budget-burn"

      # Service down alert
      - alert: WebAppDown
        expr: up{job="webapp"} == 0
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Web application is down - {{ $labels.instance }}"
          description: |
            Web application instance {{ $labels.instance }} has been down for 2 minutes.

            Check service health and logs immediately.
          runbook_url: "https://runbooks.example.com/service-down"

  - name: webapp_latency
    interval: 30s
    rules:
      # High latency (p95)
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
          ) > 0.5
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High p95 latency - {{ $labels.job }}"
          description: |
            P95 request latency is {{ $value }}s, exceeding 500ms threshold.

            This may impact user experience. Check for:
            - Slow database queries
            - External API issues
            - Resource saturation
          runbook_url: "https://runbooks.example.com/high-latency"
          dashboard: "https://grafana.example.com/d/webapp-latency"

      # Very high latency (p99)
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{job="webapp"}[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Critical latency degradation - {{ $labels.job }}"
          description: |
            P99 request latency is {{ $value }}s, exceeding 2s threshold.

            Severe performance degradation detected.
          runbook_url: "https://runbooks.example.com/high-latency"

  - name: webapp_resources
    interval: 30s
    rules:
      # High CPU
      - alert: HighCPU
        expr: |
          rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High CPU usage - {{ $labels.instance }}"
          description: |
            CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Consider scaling up or investigating CPU-intensive operations.
          runbook_url: "https://runbooks.example.com/high-cpu"

      # High memory
      - alert: HighMemory
        expr: |
          (process_resident_memory_bytes{job="webapp"} / node_memory_MemTotal_bytes) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High memory usage - {{ $labels.instance }}"
          description: |
            Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Check for memory leaks or consider scaling up.
          runbook_url: "https://runbooks.example.com/high-memory"

  - name: webapp_traffic
    interval: 30s
    rules:
      # Traffic spike
      - alert: TrafficSpike
        expr: |
          sum(rate(http_requests_total{job="webapp"}[5m]))
          >
          1.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic spike detected - {{ $labels.job }}"
          description: |
            Request rate increased by 50% compared to 1 hour ago.

            Current: {{ $value | humanize }} req/s

            This could be:
            - Legitimate traffic increase
            - DDoS attack
            - Retry storm

            Monitor closely and be ready to scale.
          runbook_url: "https://runbooks.example.com/traffic-spike"

      # Traffic drop (potential issue)
      - alert: TrafficDrop
        expr: |
          sum(rate(http_requests_total{job="webapp"}[5m]))
          <
          0.5 * sum(rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic drop detected - {{ $labels.job }}"
          description: |
            Request rate dropped by 50% compared to 1 hour ago.

            This could indicate:
            - Upstream service issue
            - DNS problems
            - Load balancer misconfiguration
          runbook_url: "https://runbooks.example.com/traffic-drop"

  - name: webapp_dependencies
    interval: 30s
    rules:
      # Database connection pool exhaustion
      - alert: DatabasePoolExhausted
        expr: |
          (db_connection_pool_active / db_connection_pool_max) > 0.9
        for: 5m
        labels:
          severity: critical
          team: backend
          component: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: |
            Connection pool is {{ $value | humanizePercentage }} full.

            This will cause request failures. Immediate action required.
          runbook_url: "https://runbooks.example.com/db-pool-exhausted"

      # External API errors
      - alert: ExternalAPIErrors
        expr: |
          sum(rate(external_api_requests_total{status=~"5.."}[5m])) by (api)
          /
          sum(rate(external_api_requests_total[5m])) by (api)
          > 0.1
        for: 5m
        labels:
          severity: warning
          team: backend
          component: integration
        annotations:
          summary: "High error rate from external API - {{ $labels.api }}"
          description: |
            {{ $labels.api }} is returning errors at {{ $value | humanizePercentage }} rate.

            Check API status page and consider enabling circuit breaker.
          runbook_url: "https://runbooks.example.com/external-api-errors"
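The 14.4 factor in ErrorBudgetFastBurn is the standard burn-rate arithmetic: with a 99.9% SLO the budget is 0.1%, and a burn rate of 14.4 consumes 14.4 / 720 = 2% of a 30-day (720-hour) budget per hour, so the whole budget would be gone in 720 / 14.4 = 50 hours, roughly the 2 days quoted in the annotation. The file's header mentions multi-window burn rate alerting, but each rule above uses a single window. A sketch of a two-window variant of the fast-burn alert (the short 5m window lets the alert clear quickly once errors stop) is shown below; it is an illustration, not part of the committed file:

```yaml
- alert: ErrorBudgetFastBurnMultiWindow
  expr: |
    (
      sum(rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
      /
      sum(rate(http_requests_total{job="webapp"}[1h]))
    ) > (14.4 * 0.001)
    and
    (
      sum(rate(http_requests_total{job="webapp",status=~"5.."}[5m]))
      /
      sum(rate(http_requests_total{job="webapp"}[5m]))
    ) > (14.4 * 0.001)
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Fast error budget burn (1h and 5m windows both above threshold)"
```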
assets/templates/runbooks/incident-runbook-template.md (new file, 409 lines)
@@ -0,0 +1,409 @@
# Runbook: [Alert Name]

## Overview

**Alert Name**: [e.g., HighLatency, ServiceDown, ErrorBudgetBurn]

**Severity**: [Critical | Warning | Info]

**Team**: [e.g., Backend, Platform, Database]

**Component**: [e.g., API Gateway, User Service, PostgreSQL]

**What it means**: [One-line description of what this alert indicates]

**User impact**: [How does this affect users? High/Medium/Low]

**Urgency**: [How quickly must this be addressed? Immediate/Hours/Days]

---

## Alert Details

### When This Alert Fires

This alert fires when:
- [Specific condition, e.g., "P95 latency exceeds 500ms for 10 minutes"]
- [Any additional conditions]

### Symptoms

Users will experience:
- [ ] Slow response times
- [ ] Errors or failures
- [ ] Service unavailable
- [ ] [Other symptoms]

### Probable Causes

Common causes include:
1. **[Cause 1]**: [Description]
   - Example: Database overload due to slow queries
2. **[Cause 2]**: [Description]
   - Example: Memory leak causing OOM errors
3. **[Cause 3]**: [Description]
   - Example: Upstream service degradation

---

## Investigation Steps

### 1. Check Service Health

**Dashboard**: [Link to primary dashboard]

**Key metrics to check**:
```promql
# Request rate
sum(rate(http_requests_total[5m]))

# Error rate
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))

# Latency (p95, p99)
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
```

**What to look for**:
- [ ] Has traffic spiked recently?
- [ ] Is error rate elevated?
- [ ] Are any endpoints particularly slow?

### 2. Check Recent Changes

**Deployments**:
```bash
# Kubernetes
kubectl rollout history deployment/[service-name] -n [namespace]

# Check when last deployed
kubectl get pods -n [namespace] -o wide | grep [service-name]
```

**What to look for**:
- [ ] Was there a recent deployment?
- [ ] Did alert start after deployment?
- [ ] Any configuration changes?

### 3. Check Logs

**Log query** (adjust for your log system):
```bash
# Kubernetes
kubectl logs deployment/[service-name] -n [namespace] --tail=100 | grep ERROR

# Elasticsearch/Kibana
GET /logs-*/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "service": "[service-name]" } },
        { "match": { "level": "error" } },
        { "range": { "@timestamp": { "gte": "now-30m" } } }
      ]
    }
  }
}

# Loki/LogQL
{job="[service-name]"} |= "error" | json | level="error"
```

**What to look for**:
- [ ] Repeated error messages
- [ ] Stack traces
- [ ] Connection errors
- [ ] Timeout errors

### 4. Check Dependencies

**Database**:
```sql
-- Check active connections
SELECT count(*) FROM pg_stat_activity WHERE state = 'active';

-- Check slow queries
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND now() - pg_stat_activity.query_start > interval '5 seconds';
```

**External APIs**:
- [ ] Check status pages: [Link to status pages]
- [ ] Check API error rates in dashboard
- [ ] Test API endpoints manually

**Cache** (Redis/Memcached):
```bash
# Redis info
redis-cli -h [host] INFO stats

# Check memory usage
redis-cli -h [host] INFO memory
```

### 5. Check Resource Usage

**CPU and Memory**:
```bash
# Kubernetes
kubectl top pods -n [namespace] | grep [service-name]

# Node metrics
kubectl top nodes
```

**Prometheus queries**:
```promql
# CPU usage by pod
sum(rate(container_cpu_usage_seconds_total{pod=~"[service-name].*"}[5m])) by (pod)

# Memory usage by pod
sum(container_memory_usage_bytes{pod=~"[service-name].*"}) by (pod)
```

**What to look for**:
- [ ] CPU throttling
- [ ] Memory approaching limits
- [ ] Disk space issues

### 6. Check Traces (if available)

**Trace query**:
```bash
# Jaeger
# Search for slow traces (> 1s) in last 30 minutes

# Tempo/TraceQL
{ duration > 1s && resource.service.name = "[service-name]" }
```

**What to look for**:
- [ ] Which operation is slow?
- [ ] Where is time spent? (DB, external API, service logic)
- [ ] Any N+1 query patterns?

---

## Common Scenarios and Solutions

### Scenario 1: Recent Deployment Caused Issue

**Symptoms**:
- Alert started immediately after deployment
- Error logs correlate with new code

**Solution**:
```bash
# Rollback deployment
kubectl rollout undo deployment/[service-name] -n [namespace]

# Verify rollback succeeded
kubectl rollout status deployment/[service-name] -n [namespace]

# Monitor for alert resolution
```

**Follow-up**:
- [ ] Create incident report
- [ ] Review deployment process
- [ ] Add pre-deployment checks

### Scenario 2: Database Performance Issue

**Symptoms**:
- Slow query logs show problematic queries
- Database CPU or connection pool exhausted

**Solution**:
```sql
-- Identify slow query
-- Kill long-running query (use with caution)
SELECT pg_cancel_backend([pid]);

-- Or terminate if cancel doesn't work
SELECT pg_terminate_backend([pid]);

-- Add index if missing (in maintenance window)
CREATE INDEX CONCURRENTLY idx_name ON table_name (column_name);
```

**Follow-up**:
- [ ] Add query performance test
- [ ] Review and optimize query
- [ ] Consider read replicas

### Scenario 3: Memory Leak

**Symptoms**:
- Memory usage gradually increasing
- Eventually OOMKilled
- Restarts temporarily fix issue

**Solution**:
```bash
# Immediate: Restart pods
kubectl rollout restart deployment/[service-name] -n [namespace]

# Increase memory limits (temporary)
kubectl set resources deployment/[service-name] -n [namespace] \
  --limits=memory=2Gi
```

**Follow-up**:
- [ ] Profile application for memory leaks
- [ ] Add memory usage alerts
- [ ] Fix root cause

### Scenario 4: Traffic Spike / DDoS

**Symptoms**:
- Sudden traffic increase
- Traffic from unusual sources
- High CPU/memory across all instances

**Solution**:
```bash
# Scale up immediately
kubectl scale deployment/[service-name] -n [namespace] --replicas=10

# Enable rate limiting at load balancer level
# (Specific steps depend on LB)

# Block suspicious IPs if confirmed DDoS
# (Use WAF or network policies)
```

**Follow-up**:
- [ ] Implement rate limiting
- [ ] Add DDoS protection (CloudFlare, WAF)
- [ ] Set up auto-scaling

### Scenario 5: Upstream Service Degradation

**Symptoms**:
- Errors calling external API
- Timeouts to upstream service
- Upstream status page shows issues

**Solution**:
```bash
# Enable circuit breaker (if available)
# Adjust timeout configuration
# Switch to backup service/cached data

# Monitor external service
# Check status page: [Link]
```

**Follow-up**:
- [ ] Implement circuit breaker pattern
- [ ] Add fallback mechanisms
- [ ] Set up external service monitoring

---

## Immediate Actions (< 5 minutes)

These should be done first to mitigate impact:

1. **[Action 1]**: [e.g., "Scale up service"]
   ```bash
   kubectl scale deployment/[service] --replicas=10
   ```

2. **[Action 2]**: [e.g., "Rollback deployment"]
   ```bash
   kubectl rollout undo deployment/[service]
   ```

3. **[Action 3]**: [e.g., "Enable circuit breaker"]

---

## Short-term Actions (< 30 minutes)

After immediate mitigation:

1. **[Action 1]**: [e.g., "Investigate root cause"]
2. **[Action 2]**: [e.g., "Optimize slow query"]
3. **[Action 3]**: [e.g., "Clear cache if stale"]

---

## Long-term Actions (Post-Incident)

Preventive measures:

1. **[Action 1]**: [e.g., "Add circuit breaker"]
2. **[Action 2]**: [e.g., "Implement auto-scaling"]
3. **[Action 3]**: [e.g., "Add query performance tests"]
4. **[Action 4]**: [e.g., "Update alert thresholds"]

---

## Escalation

If issue persists after 30 minutes:

**Escalation Path**:
1. **Primary oncall**: @[username] ([slack/email])
2. **Team lead**: @[username] ([slack/email])
3. **Engineering manager**: @[username] ([slack/email])
4. **Incident commander**: @[username] ([slack/email])

**Communication**:
- **Slack channel**: #[incidents-channel]
- **Status page**: [Link]
- **Incident tracking**: [Link to incident management tool]

---

## Related Runbooks

- [Related Runbook 1]
- [Related Runbook 2]
- [Related Runbook 3]

## Related Dashboards

- [Main Service Dashboard]
- [Resource Usage Dashboard]
- [Dependency Dashboard]

## Related Documentation

- [Architecture Diagram]
- [Service Documentation]
- [API Documentation]

---

## Recent Incidents

| Date | Duration | Root Cause | Resolution | Ticket |
|------|----------|------------|------------|--------|
| 2024-10-15 | 23 min | Database pool exhausted | Increased pool size | INC-123 |
| 2024-09-30 | 45 min | Memory leak | Fixed code, restarted | INC-120 |

---

## Runbook Metadata

**Last Updated**: [Date]

**Owner**: [Team name]

**Reviewers**: [Names]

**Next Review**: [Date]

---

## Notes

- This runbook should be reviewed quarterly
- Update after each incident to capture new learnings
- Keep investigation steps concise and actionable
- Include actual commands that can be copy-pasted
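The alert rules in this commit attach `severity` and `team` labels, and the runbook's escalation section assumes someone actually gets paged. A minimal Alertmanager routing sketch that maps those labels to receivers is shown below; the receiver names, Slack channel, and PagerDuty key are placeholders, not part of this commit:

```yaml
# alertmanager.yml (excerpt)
route:
  receiver: slack-platform             # default receiver for warnings
  group_by: [alertname, namespace]
  routes:
    - matchers: ['severity="critical"']
      receiver: pagerduty-oncall       # critical alerts page the on-call

receivers:
  - name: slack-platform
    slack_configs:
      - channel: '#incidents-channel'
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: '<pagerduty-integration-key>'
```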