Initial commit

2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions
--- a/scripts/slo_calculator.py
+++ b/scripts/slo_calculator.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""
+Calculate SLO compliance, error budgets, and burn rates.
+Supports availability SLOs and latency SLOs.
+"""
+
+import argparse
+import sys
+from datetime import datetime, timedelta
+from typing import Dict, Any, Optional
+
+try:
+    from tabulate import tabulate
+except ImportError:
+    print("⚠️  Warning: 'tabulate' library not found. Install with: pip install tabulate")
+    tabulate = None
+
+
+class SLOCalculator:
+    # SLO targets and allowed downtime per period
+    SLO_TARGETS = {
+        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},  # days
+        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
+        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
+        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
+        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
+        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
+        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
+    }
+
+    def __init__(self, slo_target: float, period_days: int = 30):
+        """
+        Initialize SLO calculator.
+
+        Args:
+            slo_target: SLO target percentage (e.g., 99.9)
+            period_days: Time period in days (default: 30)
+        """
+        self.slo_target = slo_target
+        self.period_days = period_days
+        self.error_budget_minutes = self.calculate_error_budget_minutes()
+
+    def calculate_error_budget_minutes(self) -> float:
+        """Calculate error budget in minutes for the period."""
+        total_minutes = self.period_days * 24 * 60
+        allowed_error_rate = (100 - self.slo_target) / 100
+        return total_minutes * allowed_error_rate
+
+    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
+        """
+        Calculate availability SLO compliance.
+
+        Args:
+            total_requests: Total number of requests
+            failed_requests: Number of failed requests
+
+        Returns:
+            Dict with SLO compliance metrics
+        """
+        if total_requests == 0:
+            return {
+                "error": "No requests in the period",
+                "slo_met": False
+            }
+
+        success_rate = ((total_requests - failed_requests) / total_requests) * 100
+        error_rate = (failed_requests / total_requests) * 100
+
+        # Calculate error budget consumption
+        allowed_failures = total_requests * ((100 - self.slo_target) / 100)
+        error_budget_consumed = (failed_requests / allowed_failures) * 100 if allowed_failures > 0 else float('inf')
+        error_budget_remaining = max(0, 100 - error_budget_consumed)
+
+        # Determine if SLO is met
+        slo_met = success_rate >= self.slo_target
+
+        return {
+            "slo_target": self.slo_target,
+            "period_days": self.period_days,
+            "total_requests": total_requests,
+            "failed_requests": failed_requests,
+            "success_requests": total_requests - failed_requests,
+            "success_rate": success_rate,
+            "error_rate": error_rate,
+            "slo_met": slo_met,
+            "error_budget_total": allowed_failures,
+            "error_budget_consumed": error_budget_consumed,
+            "error_budget_remaining": error_budget_remaining,
+            "margin": success_rate - self.slo_target
+        }
+
+    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
+        """
+        Calculate latency SLO compliance.
+
+        Args:
+            total_requests: Total number of requests
+            requests_exceeding_threshold: Number of requests exceeding latency threshold
+
+        Returns:
+            Dict with SLO compliance metrics
+        """
+        if total_requests == 0:
+            return {
+                "error": "No requests in the period",
+                "slo_met": False
+            }
+
+        within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100
+
+        # Calculate error budget consumption
+        allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100)
+        error_budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100 if allowed_slow_requests > 0 else float('inf')
+        error_budget_remaining = max(0, 100 - error_budget_consumed)
+
+        slo_met = within_threshold_rate >= self.slo_target
+
+        return {
+            "slo_target": self.slo_target,
+            "period_days": self.period_days,
+            "total_requests": total_requests,
+            "requests_exceeding_threshold": requests_exceeding_threshold,
+            "requests_within_threshold": total_requests - requests_exceeding_threshold,
+            "within_threshold_rate": within_threshold_rate,
+            "slo_met": slo_met,
+            "error_budget_total": allowed_slow_requests,
+            "error_budget_consumed": error_budget_consumed,
+            "error_budget_remaining": error_budget_remaining,
+            "margin": within_threshold_rate - self.slo_target
+        }
+
+    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
+        """
+        Calculate error budget burn rate.
+
+        Args:
+            errors_in_window: Number of errors in the time window
+            requests_in_window: Total requests in the time window
+            window_hours: Size of the time window in hours
+
+        Returns:
+            Dict with burn rate metrics
+        """
+        if requests_in_window == 0:
+            return {"error": "No requests in window"}
+
+        # Calculate actual error rate in this window
+        actual_error_rate = (errors_in_window / requests_in_window) * 100
+
+        # Calculate allowed error rate for SLO
+        allowed_error_rate = 100 - self.slo_target
+
+        # Burn rate = actual error rate / allowed error rate
+        burn_rate = actual_error_rate / allowed_error_rate if allowed_error_rate > 0 else float('inf')
+
+        # Calculate time to exhaustion
+        if burn_rate > 0:
+            error_budget_hours = self.error_budget_minutes / 60
+            hours_to_exhaustion = error_budget_hours / burn_rate
+        else:
+            hours_to_exhaustion = float('inf')
+
+        # Determine severity
+        if burn_rate >= 14.4:  # 1 hour window, burns budget in 2 days
+            severity = "critical"
+        elif burn_rate >= 6:  # 6 hour window, burns budget in 5 days
+            severity = "warning"
+        elif burn_rate >= 1:
+            severity = "elevated"
+        else:
+            severity = "normal"
+
+        return {
+            "window_hours": window_hours,
+            "requests_in_window": requests_in_window,
+            "errors_in_window": errors_in_window,
+            "actual_error_rate": actual_error_rate,
+            "allowed_error_rate": allowed_error_rate,
+            "burn_rate": burn_rate,
+            "hours_to_exhaustion": hours_to_exhaustion,
+            "severity": severity
+        }
+
+    @staticmethod
+    def print_slo_table():
+        """Print table of common SLO targets and allowed downtime."""
+        if not tabulate:
+            print("Install tabulate for formatted output: pip install tabulate")
+            return
+
+        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
+        print("="*60)
+
+        headers = ["SLO", "Year", "Month", "Week", "Day"]
+        rows = []
+
+        for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True):
+            row = [
+                f"{slo}%",
+                f"{downtimes['year']:.2f} days",
+                f"{downtimes['month']:.2f} days",
+                f"{downtimes['week']:.2f} days",
+                f"{downtimes['day']:.2f} days"
+            ]
+            rows.append(row)
+
+        print(tabulate(rows, headers=headers, tablefmt="grid"))
+
+
+def print_availability_results(results: Dict[str, Any]):
+    """Print availability SLO results."""
+    print("\n" + "="*60)
+    print("📊 AVAILABILITY SLO COMPLIANCE")
+    print("="*60)
+
+    if "error" in results:
+        print(f"\n❌ Error: {results['error']}")
+        return
+
+    status_emoji = "✅" if results['slo_met'] else "❌"
+    print(f"\n{status_emoji} SLO Status: {'MET' if results['slo_met'] else 'VIOLATED'}")
+    print(f"   Target: {results['slo_target']}%")
+    print(f"   Actual: {results['success_rate']:.3f}%")
+    print(f"   Margin: {results['margin']:+.3f}%")
+
+    print(f"\n📈 Request Statistics:")
+    print(f"   Total Requests: {results['total_requests']:,}")
+    print(f"   Successful: {results['success_requests']:,}")
+    print(f"   Failed: {results['failed_requests']:,}")
+    print(f"   Error Rate: {results['error_rate']:.3f}%")
+
+    print(f"\n💰 Error Budget:")
+    budget_emoji = "✅" if results['error_budget_remaining'] > 20 else "⚠️" if results['error_budget_remaining'] > 0 else "❌"
+    print(f"   {budget_emoji} Remaining: {results['error_budget_remaining']:.1f}%")
+    print(f"   Consumed: {results['error_budget_consumed']:.1f}%")
+    print(f"   Allowed Failures: {results['error_budget_total']:.0f}")
+
+    print("\n" + "="*60)
+
+
+def print_burn_rate_results(results: Dict[str, Any]):
+    """Print burn rate results."""
+    print("\n" + "="*60)
+    print("🔥 ERROR BUDGET BURN RATE")
+    print("="*60)
+
+    if "error" in results:
+        print(f"\n❌ Error: {results['error']}")
+        return
+
+    severity_emoji = {
+        "critical": "🔴",
+        "warning": "🟡",
+        "elevated": "🟠",
+        "normal": "🟢"
+    }
+
+    print(f"\n{severity_emoji.get(results['severity'], '❓')} Severity: {results['severity'].upper()}")
+    print(f"   Burn Rate: {results['burn_rate']:.2f}x")
+    print(f"   Time to Exhaustion: {results['hours_to_exhaustion']:.1f} hours ({results['hours_to_exhaustion']/24:.1f} days)")
+
+    print(f"\n📊 Window Statistics:")
+    print(f"   Window: {results['window_hours']} hours")
+    print(f"   Requests: {results['requests_in_window']:,}")
+    print(f"   Errors: {results['errors_in_window']:,}")
+    print(f"   Actual Error Rate: {results['actual_error_rate']:.3f}%")
+    print(f"   Allowed Error Rate: {results['allowed_error_rate']:.3f}%")
+
+    print("\n" + "="*60)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Calculate SLO compliance and error budgets",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Show SLO reference table
+  python3 slo_calculator.py --table
+
+  # Calculate availability SLO
+  python3 slo_calculator.py availability \\
+    --slo 99.9 \\
+    --total-requests 1000000 \\
+    --failed-requests 1500 \\
+    --period-days 30
+
+  # Calculate latency SLO
+  python3 slo_calculator.py latency \\
+    --slo 99.5 \\
+    --total-requests 500000 \\
+    --slow-requests 3000 \\
+    --period-days 7
+
+  # Calculate burn rate
+  python3 slo_calculator.py burn-rate \\
+    --slo 99.9 \\
+    --errors 50 \\
+    --requests 10000 \\
+    --window-hours 1
+        """
+    )
+
+    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
+                       help='Calculation mode')
+    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
+    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
+    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')
+
+    # Availability SLO arguments
+    parser.add_argument('--total-requests', type=int, help='Total number of requests')
+    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')
+
+    # Latency SLO arguments
+    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')
+
+    # Burn rate arguments
+    parser.add_argument('--errors', type=int, help='Number of errors in window')
+    parser.add_argument('--requests', type=int, help='Number of requests in window')
+    parser.add_argument('--window-hours', type=float, help='Window size in hours')
+
+    args = parser.parse_args()
+
+    # Show table if requested
+    if args.table:
+        SLOCalculator.print_slo_table()
+        return
+
+    if not args.mode:
+        parser.print_help()
+        return
+
+    if not args.slo:
+        print("❌ --slo required")
+        sys.exit(1)
+
+    calculator = SLOCalculator(args.slo, args.period_days)
+
+    if args.mode == 'availability':
+        if not args.total_requests or args.failed_requests is None:
+            print("❌ --total-requests and --failed-requests required")
+            sys.exit(1)
+
+        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
+        print_availability_results(results)
+
+    elif args.mode == 'latency':
+        if not args.total_requests or args.slow_requests is None:
+            print("❌ --total-requests and --slow-requests required")
+            sys.exit(1)
+
+        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
+        print_availability_results(results)  # Same format
+
+    elif args.mode == 'burn-rate':
+        if not all([args.errors is not None, args.requests, args.window_hours]):
+            print("❌ --errors, --requests, and --window-hours required")
+            sys.exit(1)
+
+        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
+        print_burn_rate_results(results)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()