gh-ahmedasmar-devops-claude…/scripts/slo_calculator.py

#!/usr/bin/env python3
"""
Calculate SLO compliance, error budgets, and burn rates.
Supports availability SLOs and latency SLOs.
"""

import argparse
import sys
from datetime import datetime, timedelta
from typing import Dict, Any, Optional

try:
    from tabulate import tabulate
except ImportError:
    print("⚠️  Warning: 'tabulate' library not found. Install with: pip install tabulate")
    tabulate = None


class SLOCalculator:
    # SLO targets and allowed downtime per period
    SLO_TARGETS = {
        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},  # days
        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
    }

    def __init__(self, slo_target: float, period_days: int = 30):
        """
        Initialize SLO calculator.

        Args:
            slo_target: SLO target percentage (e.g., 99.9)
            period_days: Time period in days (default: 30)
        """
        self.slo_target = slo_target
        self.period_days = period_days
        self.error_budget_minutes = self.calculate_error_budget_minutes()

    def calculate_error_budget_minutes(self) -> float:
        """Calculate error budget in minutes for the period."""
        total_minutes = self.period_days * 24 * 60
        allowed_error_rate = (100 - self.slo_target) / 100
        return total_minutes * allowed_error_rate

    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
        """
        Calculate availability SLO compliance.

        Args:
            total_requests: Total number of requests
            failed_requests: Number of failed requests

        Returns:
            Dict with SLO compliance metrics
        """
        if total_requests == 0:
            return {
                "error": "No requests in the period",
                "slo_met": False
            }

        success_rate = ((total_requests - failed_requests) / total_requests) * 100
        error_rate = (failed_requests / total_requests) * 100

        # Calculate error budget consumption
        allowed_failures = total_requests * ((100 - self.slo_target) / 100)
        error_budget_consumed = (failed_requests / allowed_failures) * 100 if allowed_failures > 0 else float('inf')
        error_budget_remaining = max(0, 100 - error_budget_consumed)

        # Determine if SLO is met
        slo_met = success_rate >= self.slo_target

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "failed_requests": failed_requests,
            "success_requests": total_requests - failed_requests,
            "success_rate": success_rate,
            "error_rate": error_rate,
            "slo_met": slo_met,
            "error_budget_total": allowed_failures,
            "error_budget_consumed": error_budget_consumed,
            "error_budget_remaining": error_budget_remaining,
            "margin": success_rate - self.slo_target
        }

    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
        """
        Calculate latency SLO compliance.

        Args:
            total_requests: Total number of requests
            requests_exceeding_threshold: Number of requests exceeding latency threshold

        Returns:
            Dict with SLO compliance metrics
        """
        if total_requests == 0:
            return {
                "error": "No requests in the period",
                "slo_met": False
            }

        within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100

        # Calculate error budget consumption
        allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100)
        error_budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100 if allowed_slow_requests > 0 else float('inf')
        error_budget_remaining = max(0, 100 - error_budget_consumed)

        slo_met = within_threshold_rate >= self.slo_target

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "requests_exceeding_threshold": requests_exceeding_threshold,
            "requests_within_threshold": total_requests - requests_exceeding_threshold,
            "within_threshold_rate": within_threshold_rate,
            "slo_met": slo_met,
            "error_budget_total": allowed_slow_requests,
            "error_budget_consumed": error_budget_consumed,
            "error_budget_remaining": error_budget_remaining,
            "margin": within_threshold_rate - self.slo_target
        }

    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
        """
        Calculate error budget burn rate.

        Args:
            errors_in_window: Number of errors in the time window
            requests_in_window: Total requests in the time window
            window_hours: Size of the time window in hours

        Returns:
            Dict with burn rate metrics
        """
        if requests_in_window == 0:
            return {"error": "No requests in window"}

        # Calculate actual error rate in this window
        actual_error_rate = (errors_in_window / requests_in_window) * 100

        # Calculate allowed error rate for SLO
        allowed_error_rate = 100 - self.slo_target

        # Burn rate = actual error rate / allowed error rate
        burn_rate = actual_error_rate / allowed_error_rate if allowed_error_rate > 0 else float('inf')

        # Calculate time to exhaustion
        if burn_rate > 0:
            error_budget_hours = self.error_budget_minutes / 60
            hours_to_exhaustion = error_budget_hours / burn_rate
        else:
            hours_to_exhaustion = float('inf')

        # Determine severity
        if burn_rate >= 14.4:  # 1 hour window, burns budget in 2 days
            severity = "critical"
        elif burn_rate >= 6:  # 6 hour window, burns budget in 5 days
            severity = "warning"
        elif burn_rate >= 1:
            severity = "elevated"
        else:
            severity = "normal"

        return {
            "window_hours": window_hours,
            "requests_in_window": requests_in_window,
            "errors_in_window": errors_in_window,
            "actual_error_rate": actual_error_rate,
            "allowed_error_rate": allowed_error_rate,
            "burn_rate": burn_rate,
            "hours_to_exhaustion": hours_to_exhaustion,
            "severity": severity
        }

    @staticmethod
    def print_slo_table():
        """Print table of common SLO targets and allowed downtime."""
        if not tabulate:
            print("Install tabulate for formatted output: pip install tabulate")
            return

        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
        print("="*60)

        headers = ["SLO", "Year", "Month", "Week", "Day"]
        rows = []

        for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True):
            row = [
                f"{slo}%",
                f"{downtimes['year']:.2f} days",
                f"{downtimes['month']:.2f} days",
                f"{downtimes['week']:.2f} days",
                f"{downtimes['day']:.2f} days"
            ]
            rows.append(row)

        print(tabulate(rows, headers=headers, tablefmt="grid"))


def print_availability_results(results: Dict[str, Any]):
    """Print availability SLO results."""
    print("\n" + "="*60)
    print("📊 AVAILABILITY SLO COMPLIANCE")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    status_emoji = "✅" if results['slo_met'] else "❌"
    print(f"\n{status_emoji} SLO Status: {'MET' if results['slo_met'] else 'VIOLATED'}")
    print(f"   Target: {results['slo_target']}%")
    print(f"   Actual: {results['success_rate']:.3f}%")
    print(f"   Margin: {results['margin']:+.3f}%")

    print(f"\n📈 Request Statistics:")
    print(f"   Total Requests: {results['total_requests']:,}")
    print(f"   Successful: {results['success_requests']:,}")
    print(f"   Failed: {results['failed_requests']:,}")
    print(f"   Error Rate: {results['error_rate']:.3f}%")

    print(f"\n💰 Error Budget:")
    budget_emoji = "✅" if results['error_budget_remaining'] > 20 else "⚠️" if results['error_budget_remaining'] > 0 else "❌"
    print(f"   {budget_emoji} Remaining: {results['error_budget_remaining']:.1f}%")
    print(f"   Consumed: {results['error_budget_consumed']:.1f}%")
    print(f"   Allowed Failures: {results['error_budget_total']:.0f}")

    print("\n" + "="*60)


def print_burn_rate_results(results: Dict[str, Any]):
    """Print burn rate results."""
    print("\n" + "="*60)
    print("🔥 ERROR BUDGET BURN RATE")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    severity_emoji = {
        "critical": "🔴",
        "warning": "🟡",
        "elevated": "🟠",
        "normal": "🟢"
    }

    print(f"\n{severity_emoji.get(results['severity'], '❓')} Severity: {results['severity'].upper()}")
    print(f"   Burn Rate: {results['burn_rate']:.2f}x")
    print(f"   Time to Exhaustion: {results['hours_to_exhaustion']:.1f} hours ({results['hours_to_exhaustion']/24:.1f} days)")

    print(f"\n📊 Window Statistics:")
    print(f"   Window: {results['window_hours']} hours")
    print(f"   Requests: {results['requests_in_window']:,}")
    print(f"   Errors: {results['errors_in_window']:,}")
    print(f"   Actual Error Rate: {results['actual_error_rate']:.3f}%")
    print(f"   Allowed Error Rate: {results['allowed_error_rate']:.3f}%")

    print("\n" + "="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Calculate SLO compliance and error budgets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show SLO reference table
  python3 slo_calculator.py --table

  # Calculate availability SLO
  python3 slo_calculator.py availability \\
    --slo 99.9 \\
    --total-requests 1000000 \\
    --failed-requests 1500 \\
    --period-days 30

  # Calculate latency SLO
  python3 slo_calculator.py latency \\
    --slo 99.5 \\
    --total-requests 500000 \\
    --slow-requests 3000 \\
    --period-days 7

  # Calculate burn rate
  python3 slo_calculator.py burn-rate \\
    --slo 99.9 \\
    --errors 50 \\
    --requests 10000 \\
    --window-hours 1
        """
    )

    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
                       help='Calculation mode')
    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')

    # Availability SLO arguments
    parser.add_argument('--total-requests', type=int, help='Total number of requests')
    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')

    # Latency SLO arguments
    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')

    # Burn rate arguments
    parser.add_argument('--errors', type=int, help='Number of errors in window')
    parser.add_argument('--requests', type=int, help='Number of requests in window')
    parser.add_argument('--window-hours', type=float, help='Window size in hours')

    args = parser.parse_args()

    # Show table if requested
    if args.table:
        SLOCalculator.print_slo_table()
        return

    if not args.mode:
        parser.print_help()
        return

    if not args.slo:
        print("❌ --slo required")
        sys.exit(1)

    calculator = SLOCalculator(args.slo, args.period_days)

    if args.mode == 'availability':
        if not args.total_requests or args.failed_requests is None:
            print("❌ --total-requests and --failed-requests required")
            sys.exit(1)

        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
        print_availability_results(results)

    elif args.mode == 'latency':
        if not args.total_requests or args.slow_requests is None:
            print("❌ --total-requests and --slow-requests required")
            sys.exit(1)

        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
        print_availability_results(results)  # Same format

    elif args.mode == 'burn-rate':
        if not all([args.errors is not None, args.requests, args.window_hours]):
            print("❌ --errors, --requests, and --window-hours required")
            sys.exit(1)

        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
        print_burn_rate_results(results)


if __name__ == "__main__":
    main()