---
# Prometheus Alert Rules for Web Applications
# Based on SLO best practices and multi-window burn rate alerting

groups:
  - name: webapp_availability
    interval: 30s
    rules:
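      # Why 14.4? Burn rate = (fraction of error budget consumed) divided by
      # (window as a fraction of the SLO period). Consuming 2% of a 30-day
      # budget within 1 hour gives 0.02 * (30 * 24) / 1 = 14.4, the fast-page
      # multiplier recommended by the Google SRE Workbook. 0.001 is the error
      # budget of a 99.9% SLO.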
      # Fast burn rate alert (1h window) - SLO: 99.9%
      - alert: ErrorBudgetFastBurn
        expr: |
          (
            sum by (job) (rate(http_requests_total{job="webapp",status=~"5.."}[1h]))
            /
            sum by (job) (rate(http_requests_total{job="webapp"}[1h]))
          ) > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Fast error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last hour,
            burning through the error budget at a 14.4x rate.

            At this rate, the monthly error budget will be exhausted in about 2 days.

            Immediate investigation required.
          runbook_url: "https://runbooks.example.com/error-budget-burn"
          dashboard: "https://grafana.example.com/d/webapp"

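      # Slow-burn factor: consuming 5% of the budget within 6 hours gives
      # 0.05 * (30 * 24) / 6 = 6, again per the SRE Workbook's multi-window,
      # multi-burn-rate guidance.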
      # Slow burn rate alert (6h window)
      - alert: ErrorBudgetSlowBurn
        expr: |
          (
            sum by (job) (rate(http_requests_total{job="webapp",status=~"5.."}[6h]))
            /
            sum by (job) (rate(http_requests_total{job="webapp"}[6h]))
          ) > (6 * 0.001)
        for: 30m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Elevated error budget burn - {{ $labels.job }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
            burning through the error budget at a 6x rate.

            Monitor closely and investigate if the trend continues.
          runbook_url: "https://runbooks.example.com/error-budget-burn"

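      # Note: `up` is generated by Prometheus itself (1 = scrape succeeded,
      # 0 = scrape failed), so the alert below fires both for crashed
      # instances and for network problems between Prometheus and the target.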
      # Service down alert
      - alert: WebAppDown
        expr: up{job="webapp"} == 0
        for: 2m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Web application is down - {{ $labels.instance }}"
          description: |
            Web application instance {{ $labels.instance }} has been down for 2 minutes.

            Check service health and logs immediately.
          runbook_url: "https://runbooks.example.com/service-down"

  - name: webapp_latency
    interval: 30s
    rules:
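      # Note: histogram_quantile() estimates the quantile by linear
      # interpolation within bucket boundaries (`le`), so the reported
      # p95/p99 is only as precise as the bucket layout of
      # http_request_duration_seconds_bucket.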
      # High latency (p95)
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum by (job, le) (rate(http_request_duration_seconds_bucket{job="webapp"}[5m]))
          ) > 0.5
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High p95 latency - {{ $labels.job }}"
          description: |
            P95 request latency is {{ $value }}s, exceeding the 500ms threshold.

            This may impact user experience. Check for:
            - Slow database queries
            - External API issues
            - Resource saturation
          runbook_url: "https://runbooks.example.com/high-latency"
          dashboard: "https://grafana.example.com/d/webapp-latency"

      # Very high latency (p99)
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum by (job, le) (rate(http_request_duration_seconds_bucket{job="webapp"}[5m]))
          ) > 2
        for: 5m
        labels:
          severity: critical
          team: backend
          component: webapp
        annotations:
          summary: "Critical latency degradation - {{ $labels.job }}"
          description: |
            P99 request latency is {{ $value }}s, exceeding the 2s threshold.

            Severe performance degradation detected.
          runbook_url: "https://runbooks.example.com/high-latency"

  - name: webapp_resources
    interval: 30s
    rules:
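      # Note: process_cpu_seconds_total counts CPU time across all cores, so
      # rate() can exceed 1 (i.e. 100%) for multi-threaded processes; the 80%
      # threshold below assumes a roughly single-core workload.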
      # High CPU
      - alert: HighCPU
        expr: |
          rate(process_cpu_seconds_total{job="webapp"}[5m]) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High CPU usage - {{ $labels.instance }}"
          description: |
            CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Consider scaling up or investigating CPU-intensive operations.
          runbook_url: "https://runbooks.example.com/high-cpu"

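      # Note: process_resident_memory_bytes (from the app) and
      # node_memory_MemTotal_bytes (from node_exporter) come from different
      # jobs, so a bare division never matches any series. The
      # `on(instance) group_left()` join below assumes both targets share the
      # same instance label, which may require relabeling if the scrape ports
      # differ.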
      # High memory
      - alert: HighMemory
        expr: |
          (
            process_resident_memory_bytes{job="webapp"}
            / on(instance) group_left()
            node_memory_MemTotal_bytes
          ) * 100 > 80
        for: 15m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "High memory usage - {{ $labels.instance }}"
          description: |
            Memory usage is {{ $value | humanize }}% on {{ $labels.instance }}.

            Check for memory leaks or consider scaling up.
          runbook_url: "https://runbooks.example.com/high-memory"

  - name: webapp_traffic
    interval: 30s
    rules:
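      # Note: `offset 1h` compares the current 5m rate against the same 5m
      # window one hour earlier. This is a simple baseline and will also
      # trigger on normal daily traffic patterns (e.g. morning ramp-up).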
      # Traffic spike
      - alert: TrafficSpike
        expr: |
          sum by (job) (rate(http_requests_total{job="webapp"}[5m]))
          >
          1.5 * sum by (job) (rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic spike detected - {{ $labels.job }}"
          description: |
            Request rate is more than 50% above its level 1 hour ago.

            Current: {{ $value | humanize }} req/s

            This could be:
            - Legitimate traffic increase
            - DDoS attack
            - Retry storm

            Monitor closely and be ready to scale.
          runbook_url: "https://runbooks.example.com/traffic-spike"

      # Traffic drop (potential issue)
      - alert: TrafficDrop
        expr: |
          sum by (job) (rate(http_requests_total{job="webapp"}[5m]))
          <
          0.5 * sum by (job) (rate(http_requests_total{job="webapp"}[5m] offset 1h))
        for: 10m
        labels:
          severity: warning
          team: backend
          component: webapp
        annotations:
          summary: "Traffic drop detected - {{ $labels.job }}"
          description: |
            Request rate is more than 50% below its level 1 hour ago.

            This could indicate:
            - Upstream service issue
            - DNS problems
            - Load balancer misconfiguration
          runbook_url: "https://runbooks.example.com/traffic-drop"

  - name: webapp_dependencies
    interval: 30s
    rules:
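      # Note: db_connection_pool_active and db_connection_pool_max are
      # application-specific metrics; adjust the names to whatever your
      # client library or instrumentation actually exports.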
      # Database connection pool exhaustion
      - alert: DatabasePoolExhausted
        expr: |
          (db_connection_pool_active / db_connection_pool_max) > 0.9
        for: 5m
        labels:
          severity: critical
          team: backend
          component: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: |
            Connection pool is {{ $value | humanizePercentage }} full.

            This will cause request failures. Immediate action required.
          runbook_url: "https://runbooks.example.com/db-pool-exhausted"

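      # Note: aggregating `by (api)` keeps the api label on the result, so
      # {{ $labels.api }} resolves in the annotations and Alertmanager can
      # route per upstream dependency.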
      # External API errors
      - alert: ExternalAPIErrors
        expr: |
          (
            sum by (api) (rate(external_api_requests_total{status=~"5.."}[5m]))
            /
            sum by (api) (rate(external_api_requests_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          team: backend
          component: integration
        annotations:
          summary: "High error rate from external API - {{ $labels.api }}"
          description: |
            {{ $labels.api }} is returning errors at a {{ $value | humanizePercentage }} rate.

            Check the API status page and consider enabling the circuit breaker.
          runbook_url: "https://runbooks.example.com/external-api-errors"
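
# To load these rules, reference this file from prometheus.yml (the file
# name below is an example):
#
#   rule_files:
#     - "webapp_alert_rules.yml"
#
# Validate the file before deploying:
#
#   promtool check rules webapp_alert_rules.yml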