Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions

365
scripts/slo_calculator.py Normal file
View File

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
Calculate SLO compliance, error budgets, and burn rates.
Supports availability SLOs and latency SLOs.
"""
import argparse
import sys
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
try:
from tabulate import tabulate
except ImportError:
print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate")
tabulate = None
class SLOCalculator:
    """Compute SLO compliance, error budgets, and burn rates for one target.

    An instance is bound to a single SLO target percentage and a reporting
    period length. The calculation methods return plain dicts; on unusable
    input they return a dict containing an ``"error"`` message instead of
    raising.
    """

    # Approximate allowed downtime, in days, per reporting period for common
    # SLO targets. Keys are the target percentage written as a string.
    SLO_TARGETS = {
        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},
        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
    }

    def __init__(self, slo_target: float, period_days: int = 30):
        """
        Initialize SLO calculator.

        Args:
            slo_target: SLO target percentage (e.g., 99.9)
            period_days: Time period in days (default: 30)
        """
        self.slo_target = slo_target
        self.period_days = period_days
        self.error_budget_minutes = self.calculate_error_budget_minutes()

    def calculate_error_budget_minutes(self) -> float:
        """Calculate error budget in minutes for the period."""
        total_minutes = self.period_days * 24 * 60
        allowed_error_rate = (100 - self.slo_target) / 100
        return total_minutes * allowed_error_rate

    @staticmethod
    def _count_error(total: int, bad: int, empty_message: str) -> Optional[str]:
        """Return an error message for unusable request counts, else None."""
        if total == 0:
            return empty_message
        if total < 0 or bad < 0:
            return "Request counts must not be negative"
        if bad > total:
            return "Bad request count cannot exceed total requests"
        return None

    def _budget_usage(self, total: int, bad: int):
        """Return (allowed_bad, consumed_percent, remaining_percent)."""
        allowed = total * ((100 - self.slo_target) / 100)
        if allowed > 0:
            consumed = (bad / allowed) * 100
        elif bad > 0:
            # No budget at all (e.g. a 100% target) and it was still spent.
            consumed = float('inf')
        else:
            # No budget and nothing spent: report 0% rather than infinity.
            consumed = 0.0
        remaining = max(0, 100 - consumed)
        return allowed, consumed, remaining

    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
        """
        Calculate availability SLO compliance.

        Args:
            total_requests: Total number of requests
            failed_requests: Number of failed requests

        Returns:
            Dict with SLO compliance metrics, or a dict holding "error" and
            "slo_met": False when the counts are empty, negative, or
            inconsistent (more failures than requests).
        """
        problem = self._count_error(total_requests, failed_requests, "No requests in the period")
        if problem is not None:
            return {
                "error": problem,
                "slo_met": False
            }

        success_rate = ((total_requests - failed_requests) / total_requests) * 100
        error_rate = (failed_requests / total_requests) * 100
        allowed_failures, consumed, remaining = self._budget_usage(total_requests, failed_requests)

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "failed_requests": failed_requests,
            "success_requests": total_requests - failed_requests,
            "success_rate": success_rate,
            "error_rate": error_rate,
            "slo_met": success_rate >= self.slo_target,
            "error_budget_total": allowed_failures,
            "error_budget_consumed": consumed,
            "error_budget_remaining": remaining,
            "margin": success_rate - self.slo_target
        }

    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
        """
        Calculate latency SLO compliance.

        Args:
            total_requests: Total number of requests
            requests_exceeding_threshold: Number of requests exceeding latency threshold

        Returns:
            Dict with SLO compliance metrics, or a dict holding "error" and
            "slo_met": False when the counts are empty, negative, or
            inconsistent (more slow requests than requests).
        """
        problem = self._count_error(total_requests, requests_exceeding_threshold, "No requests in the period")
        if problem is not None:
            return {
                "error": problem,
                "slo_met": False
            }

        within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100
        allowed_slow_requests, consumed, remaining = self._budget_usage(
            total_requests, requests_exceeding_threshold
        )

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "requests_exceeding_threshold": requests_exceeding_threshold,
            "requests_within_threshold": total_requests - requests_exceeding_threshold,
            "within_threshold_rate": within_threshold_rate,
            "slo_met": within_threshold_rate >= self.slo_target,
            "error_budget_total": allowed_slow_requests,
            "error_budget_consumed": consumed,
            "error_budget_remaining": remaining,
            "margin": within_threshold_rate - self.slo_target
        }

    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
        """
        Calculate error budget burn rate.

        Args:
            errors_in_window: Number of errors in the time window
            requests_in_window: Total requests in the time window
            window_hours: Size of the time window in hours (reported back in
                the result; it does not enter the calculation)

        Returns:
            Dict with burn rate metrics, or a dict holding only "error" when
            the counts are empty, negative, or inconsistent.
        """
        problem = self._count_error(requests_in_window, errors_in_window, "No requests in window")
        if problem is not None:
            return {"error": problem}

        # Observed and permitted error rates, both as percentages.
        actual_error_rate = (errors_in_window / requests_in_window) * 100
        allowed_error_rate = 100 - self.slo_target

        # Burn rate = actual error rate / allowed error rate.
        if allowed_error_rate > 0:
            burn_rate = actual_error_rate / allowed_error_rate
        elif errors_in_window > 0:
            burn_rate = float('inf')
        else:
            # Nothing allowed and nothing failed: no budget is being burned.
            burn_rate = 0.0

        # A burn rate of 1 spends the budget over exactly one period, so a
        # full budget lasts period / burn_rate at a sustained rate.
        if burn_rate > 0:
            hours_to_exhaustion = (self.period_days * 24) / burn_rate
        else:
            hours_to_exhaustion = float('inf')

        # Determine severity
        if burn_rate >= 14.4:  # on a 30-day period: budget gone in ~2 days
            severity = "critical"
        elif burn_rate >= 6:  # on a 30-day period: budget gone in 5 days
            severity = "warning"
        elif burn_rate >= 1:
            severity = "elevated"
        else:
            severity = "normal"

        return {
            "window_hours": window_hours,
            "requests_in_window": requests_in_window,
            "errors_in_window": errors_in_window,
            "actual_error_rate": actual_error_rate,
            "allowed_error_rate": allowed_error_rate,
            "burn_rate": burn_rate,
            "hours_to_exhaustion": hours_to_exhaustion,
            "severity": severity
        }

    @staticmethod
    def _format_downtime(days: float) -> str:
        """Render a duration given in days using the largest unit that is >= 1."""
        if days >= 1:
            return f"{days:.2f} days"
        hours = days * 24
        if hours >= 1:
            return f"{hours:.2f} hours"
        minutes = hours * 60
        if minutes >= 1:
            return f"{minutes:.2f} min"
        return f"{minutes * 60:.2f} sec"

    @staticmethod
    def print_slo_table():
        """Print table of common SLO targets and allowed downtime.

        Uses ``tabulate`` for a grid layout when it is available and falls
        back to plain aligned columns otherwise.
        """
        headers = ["SLO", "Year", "Month", "Week", "Day"]
        periods = ("year", "month", "week", "day")

        # Sort by numeric value; the keys are strings and would otherwise be
        # ordered lexicographically.
        ordered = sorted(
            SLOCalculator.SLO_TARGETS.items(),
            key=lambda item: float(item[0]),
            reverse=True,
        )
        rows = [
            [f"{slo}%"] + [SLOCalculator._format_downtime(downtimes[p]) for p in periods]
            for slo, downtimes in ordered
        ]

        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
        print("="*60)

        if tabulate:
            print(tabulate(rows, headers=headers, tablefmt="grid"))
            return

        # Plain-text fallback: pad every column to its widest cell.
        widths = [max(len(cell) for cell in column) for column in zip(headers, *rows)]
        for line in [headers] + rows:
            print("  ".join(cell.ljust(width) for cell, width in zip(line, widths)))
def print_availability_results(results: Dict[str, Any]):
    """Print an SLO compliance report for availability or latency results.

    The function accepts either result shape: a dict carrying
    ``within_threshold_rate`` is rendered with latency wording, anything else
    with availability wording. A dict carrying ``"error"`` prints only the
    error message.

    Args:
        results: Metrics dict with either the availability keys
            (success_rate, success_requests, failed_requests, error_rate) or
            the latency keys (within_threshold_rate,
            requests_within_threshold, requests_exceeding_threshold), plus
            the shared keys slo_target, slo_met, margin, total_requests and
            the three error_budget_* values.
    """
    is_latency = "within_threshold_rate" in results
    title = "LATENCY" if is_latency else "AVAILABILITY"

    print("\n" + "="*60)
    print(f"📊 {title} SLO COMPLIANCE")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    # Pick the values and labels that match the result shape, so latency
    # results no longer fail on availability-only keys.
    if is_latency:
        achieved = results['within_threshold_rate']
        good_label, good = "Within Threshold", results['requests_within_threshold']
        bad_label, bad = "Over Threshold", results['requests_exceeding_threshold']
        rate_label = "Slow Rate"
        budget_label = "Allowed Slow Requests"
    else:
        achieved = results['success_rate']
        good_label, good = "Successful", results['success_requests']
        bad_label, bad = "Failed", results['failed_requests']
        rate_label = "Error Rate"
        budget_label = "Allowed Failures"
    # Latency results carry no explicit bad-event rate; derive it.
    bad_rate = results.get('error_rate', 100 - achieved)

    met = results['slo_met']
    status_emoji = "✅" if met else "❌"
    print(f"\n{status_emoji} SLO Status: {'MET' if met else 'VIOLATED'}")
    print(f" Target: {results['slo_target']}%")
    print(f" Actual: {achieved:.3f}%")
    print(f" Margin: {results['margin']:+.3f}%")

    print("\n📈 Request Statistics:")
    print(f" Total Requests: {results['total_requests']:,}")
    print(f" {good_label}: {good:,}")
    print(f" {bad_label}: {bad:,}")
    print(f" {rate_label}: {bad_rate:.3f}%")

    print("\n💰 Error Budget:")
    remaining = results['error_budget_remaining']
    # Healthy above 20% remaining, warning while any budget is left.
    if remaining > 20:
        budget_emoji = "✅"
    elif remaining > 0:
        budget_emoji = "⚠️"
    else:
        budget_emoji = "❌"
    print(f" {budget_emoji} Remaining: {remaining:.1f}%")
    print(f" Consumed: {results['error_budget_consumed']:.1f}%")
    print(f" {budget_label}: {results['error_budget_total']:.0f}")
    print("\n" + "="*60)
def print_burn_rate_results(results: Dict[str, Any]):
    """Write a burn-rate report to stdout.

    Args:
        results: Burn-rate metrics dict. When it contains an "error" key only
            the banner and that message are printed; otherwise the severity,
            burn rate, time to exhaustion, and window statistics are shown.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("🔥 ERROR BUDGET BURN RATE")
    print(rule)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    # Map each severity level to its indicator; unknown levels get none.
    icons = dict(critical="🔴", warning="🟡", elevated="🟠", normal="🟢")
    level = results['severity']
    icon = icons.get(level, '')
    print(f"\n{icon} Severity: {level.upper()}")
    print(f" Burn Rate: {results['burn_rate']:.2f}x")

    remaining_hours = results['hours_to_exhaustion']
    remaining_days = remaining_hours / 24
    print(f" Time to Exhaustion: {remaining_hours:.1f} hours ({remaining_days:.1f} days)")

    print("\n📊 Window Statistics:")
    print(f" Window: {results['window_hours']} hours")
    print(f" Requests: {results['requests_in_window']:,}")
    print(f" Errors: {results['errors_in_window']:,}")
    print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%")
    print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%")
    print("\n" + rule)
def main():
    """Parse command-line arguments and run the requested SLO calculation.

    With ``--table`` the reference table is printed and nothing else runs.
    Without a mode the help text is printed. Otherwise ``--slo`` and the
    mode-specific options are validated; on a missing or out-of-range value
    a message is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Calculate SLO compliance and error budgets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Show SLO reference table
python3 slo_calculator.py --table
# Calculate availability SLO
python3 slo_calculator.py availability \\
--slo 99.9 \\
--total-requests 1000000 \\
--failed-requests 1500 \\
--period-days 30
# Calculate latency SLO
python3 slo_calculator.py latency \\
--slo 99.5 \\
--total-requests 500000 \\
--slow-requests 3000 \\
--period-days 7
# Calculate burn rate
python3 slo_calculator.py burn-rate \\
--slo 99.9 \\
--errors 50 \\
--requests 10000 \\
--window-hours 1
"""
    )
    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
                        help='Calculation mode')
    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')
    # Availability SLO arguments
    parser.add_argument('--total-requests', type=int, help='Total number of requests')
    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')
    # Latency SLO arguments
    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')
    # Burn rate arguments
    parser.add_argument('--errors', type=int, help='Number of errors in window')
    parser.add_argument('--requests', type=int, help='Number of requests in window')
    parser.add_argument('--window-hours', type=float, help='Window size in hours')
    args = parser.parse_args()

    def fail(message):
        """Print a usage error and terminate with exit status 1."""
        print(f"❌ {message}")
        sys.exit(1)

    def check_counts(total, bad, total_flag, bad_flag):
        """Reject negative counts and a bad-event count above the total."""
        if total < 0 or bad < 0:
            fail(f"{total_flag} and {bad_flag} must not be negative")
        if bad > total:
            fail(f"{bad_flag} cannot exceed {total_flag}")

    # Show table if requested
    if args.table:
        SLOCalculator.print_slo_table()
        return

    if not args.mode:
        parser.print_help()
        return

    if args.slo is None:
        fail("--slo required")
    # A percentage outside (0, 100] would yield negative or oversized budgets.
    if not 0 < args.slo <= 100:
        fail("--slo must be greater than 0 and at most 100")
    if args.period_days <= 0:
        fail("--period-days must be a positive number of days")

    calculator = SLOCalculator(args.slo, args.period_days)

    if args.mode == 'availability':
        # Compare against None so an explicit 0 is not reported as missing.
        if args.total_requests is None or args.failed_requests is None:
            fail("--total-requests and --failed-requests required")
        check_counts(args.total_requests, args.failed_requests,
                     "--total-requests", "--failed-requests")
        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
        print_availability_results(results)

    elif args.mode == 'latency':
        if args.total_requests is None or args.slow_requests is None:
            fail("--total-requests and --slow-requests required")
        check_counts(args.total_requests, args.slow_requests,
                     "--total-requests", "--slow-requests")
        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
        if "error" not in results:
            # The shared printer is called with latency results; mirror the
            # latency fields onto the availability-style key names so the
            # report can be rendered from either set.
            results.setdefault("success_rate", results["within_threshold_rate"])
            results.setdefault("success_requests", results["requests_within_threshold"])
            results.setdefault("failed_requests", results["requests_exceeding_threshold"])
            results.setdefault("error_rate", 100 - results["within_threshold_rate"])
        print_availability_results(results)

    elif args.mode == 'burn-rate':
        if args.errors is None or args.requests is None or args.window_hours is None:
            fail("--errors, --requests, and --window-hours required")
        if args.window_hours <= 0:
            fail("--window-hours must be greater than 0")
        check_counts(args.requests, args.errors, "--requests", "--errors")
        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
        print_burn_rate_results(results)


# Run the CLI only when the file is executed as a script.
if __name__ == "__main__":
    main()