Initial commit
315
scripts/alert_quality_checker.py
Normal file
@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Audit Prometheus alert rules against best practices.
Checks for: alert naming, severity labels, runbook links, expression quality.
"""

import argparse
import sys
import re
from typing import Dict, List, Any
from pathlib import Path

try:
    import yaml
except ImportError:
    print("⚠️ Warning: 'PyYAML' library not found. Install with: pip install pyyaml")
    sys.exit(1)


class AlertQualityChecker:
    def __init__(self):
        self.issues = []
        self.warnings = []
        self.recommendations = []

    def check_alert_name(self, alert_name: str) -> List[str]:
        """Check alert naming conventions."""
        issues = []

        # Should be PascalCase (first letter capitalized, alphanumeric)
        if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name):
            issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)")

        # Should be descriptive
        if len(alert_name) < 5:
            issues.append(f"Alert name '{alert_name}' is too short, use descriptive names")

        # Avoid generic names
        generic_names = ['Alert', 'Test', 'Warning', 'Error']
        if alert_name in generic_names:
            issues.append(f"Alert name '{alert_name}' is too generic")

        return issues

    def check_labels(self, alert: Dict[str, Any]) -> List[str]:
        """Check required and recommended labels."""
        issues = []
        labels = alert.get('labels', {})

        # Required labels
        if 'severity' not in labels:
            issues.append("Missing required 'severity' label (critical/warning/info)")
        elif labels['severity'] not in ['critical', 'warning', 'info']:
            issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info")

        # Recommended labels
        if 'team' not in labels:
            self.recommendations.append("Consider adding 'team' label for routing")

        if 'component' not in labels and 'service' not in labels:
            self.recommendations.append("Consider adding 'component' or 'service' label")

        return issues

    def check_annotations(self, alert: Dict[str, Any]) -> List[str]:
        """Check annotations quality."""
        issues = []
        annotations = alert.get('annotations', {})

        # Required annotations
        if 'summary' not in annotations:
            issues.append("Missing 'summary' annotation")
        elif len(annotations['summary']) < 10:
            issues.append("Summary annotation is too short, provide clear description")

        if 'description' not in annotations:
            issues.append("Missing 'description' annotation")

        # Runbook
        if 'runbook_url' not in annotations and 'runbook' not in annotations:
            self.recommendations.append("Consider adding 'runbook_url' for incident response")

        # Check for templating
        if 'summary' in annotations:
            if '{{ $value }}' not in annotations['summary'] and '{{' not in annotations['summary']:
                self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})")

        return issues

    def check_expression(self, expr: str, alert_name: str) -> List[str]:
        """Check PromQL expression quality."""
        issues = []

        # Should have a threshold
        if '>' not in expr and '<' not in expr and '==' not in expr and '!=' not in expr:
            issues.append("Expression should include a comparison operator")

        # Should use rate() for counters
        if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr:
            self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)")

        # Avoid instant queries without aggregation
        if not any(agg in expr for agg in ['sum(', 'avg(', 'min(', 'max(', 'count(']):
            if expr.count('{') > 1:  # Multiple metrics without aggregation
                self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.")

        # Check for proper time windows
        if '[' not in expr and 'rate(' in expr:
            issues.append("rate() requires a time window (e.g., rate(metric[5m]))")

        return issues

    def check_for_duration(self, rule: Dict[str, Any]) -> List[str]:
        """Check for 'for' clause to prevent flapping."""
        issues = []
        severity = rule.get('labels', {}).get('severity', 'unknown')

        if 'for' not in rule:
            if severity == 'critical':
                issues.append("Critical alerts should have 'for' clause to prevent flapping")
            else:
                self.warnings.append("Consider adding 'for' clause to prevent alert flapping")
        else:
            # Parse duration
            duration = rule['for']
            if severity == 'critical' and any(x in duration for x in ['0s', '30s', '1m']):
                self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts")

        return issues

    def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]:
        """Check a single alert rule."""
        alert_name = rule.get('alert', 'Unknown')
        issues = []

        # Check alert name
        issues.extend(self.check_alert_name(alert_name))

        # Check expression
        if 'expr' not in rule:
            issues.append("Missing 'expr' field")
        else:
            issues.extend(self.check_expression(rule['expr'], alert_name))

        # Check labels
        issues.extend(self.check_labels(rule))

        # Check annotations
        issues.extend(self.check_annotations(rule))

        # Check for duration
        issues.extend(self.check_for_duration(rule))

        return {
            "alert": alert_name,
            "issues": issues,
            "severity": rule.get('labels', {}).get('severity', 'unknown')
        }

    def analyze_file(self, filepath: str) -> Dict[str, Any]:
        """Analyze a Prometheus rules file."""
        try:
            with open(filepath, 'r') as f:
                data = yaml.safe_load(f)

            if not data:
                return {"error": "Empty or invalid YAML file"}

            results = []
            groups = data.get('groups', [])

            for group in groups:
                group_name = group.get('name', 'Unknown')
                rules = group.get('rules', [])

                for rule in rules:
                    # Only check alerting rules, not recording rules
                    if 'alert' in rule:
                        result = self.check_alert_rule(rule)
                        result['group'] = group_name
                        results.append(result)

            return {
                "file": filepath,
                "groups": len(groups),
                "alerts_checked": len(results),
                "results": results
            }

        except Exception as e:
            return {"error": f"Failed to parse file: {e}"}


def print_results(analysis: Dict[str, Any], checker: AlertQualityChecker):
    """Pretty print analysis results."""
    print("\n" + "="*60)
    print("🚨 ALERT QUALITY CHECK RESULTS")
    print("="*60)

    if "error" in analysis:
        print(f"\n❌ Error: {analysis['error']}")
        return

    print(f"\n📁 File: {analysis['file']}")
    print(f"📊 Groups: {analysis['groups']}")
    print(f"🔔 Alerts Checked: {analysis['alerts_checked']}")

    # Count alerts with issues
    critical_count = 0

    for result in analysis['results']:
        if result['issues']:
            critical_count += 1

    print(f"\n{'='*60}")
    print(f"📈 Summary:")
    print(f"   ❌ Alerts with Issues: {critical_count}")
    print(f"   ⚠️ Warnings: {len(checker.warnings)}")
    print(f"   💡 Recommendations: {len(checker.recommendations)}")

    # Print detailed results
    if critical_count > 0:
        print(f"\n{'='*60}")
        print("❌ ALERTS WITH ISSUES:")
        print(f"{'='*60}")

        for result in analysis['results']:
            if result['issues']:
                print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})")
                print(f"   Severity: {result['severity']}")
                print("   Issues:")
                for issue in result['issues']:
                    print(f"     • {issue}")

    # Print warnings
    if checker.warnings:
        print(f"\n{'='*60}")
        print("⚠️ WARNINGS:")
        print(f"{'='*60}")
        for warning in set(checker.warnings):  # Remove duplicates
            print(f"• {warning}")

    # Print recommendations
    if checker.recommendations:
        print(f"\n{'='*60}")
        print("💡 RECOMMENDATIONS:")
        print(f"{'='*60}")
        for rec in list(set(checker.recommendations))[:10]:  # Top 10 unique recommendations
            print(f"• {rec}")

    # Overall score
    total_alerts = analysis['alerts_checked']
    if total_alerts > 0:
        quality_score = ((total_alerts - critical_count) / total_alerts) * 100
        print(f"\n{'='*60}")
        print(f"📊 Quality Score: {quality_score:.1f}% ({total_alerts - critical_count}/{total_alerts} alerts passing)")
        print(f"{'='*60}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Audit Prometheus alert rules for quality and best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check a single file
  python3 alert_quality_checker.py alerts.yml

  # Check all YAML files in a directory
  python3 alert_quality_checker.py /path/to/prometheus/rules/

Best Practices Checked:
  ✓ Alert naming conventions (PascalCase, descriptive)
  ✓ Required labels (severity)
  ✓ Required annotations (summary, description)
  ✓ Runbook URL presence
  ✓ PromQL expression quality
  ✓ 'for' clause to prevent flapping
  ✓ Template variable usage
        """
    )

    parser.add_argument('path', help='Path to alert rules file or directory')
    parser.add_argument('--verbose', action='store_true', help='Show all recommendations')

    args = parser.parse_args()

    checker = AlertQualityChecker()

    # Check if path is file or directory
    path = Path(args.path)

    if path.is_file():
        files = [str(path)]
    elif path.is_dir():
        files = [str(f) for f in path.rglob('*.yml')] + [str(f) for f in path.rglob('*.yaml')]
    else:
        print(f"❌ Path not found: {args.path}")
        sys.exit(1)

    if not files:
        print(f"❌ No YAML files found in: {args.path}")
        sys.exit(1)

    print(f"🔍 Checking {len(files)} file(s)...")

    for filepath in files:
        analysis = checker.analyze_file(filepath)
        print_results(analysis, checker)


if __name__ == "__main__":
    main()
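Usage note (illustrative): the checker can also be driven from Python rather than the CLI. A minimal sketch, assuming the script is importable as a module (e.g., run from the scripts/ directory); the rule dict below is hypothetical and only mirrors the shape yaml.safe_load() produces for one alerting rule:

    from alert_quality_checker import AlertQualityChecker

    checker = AlertQualityChecker()
    rule = {
        "alert": "HighErrorRate",
        "expr": 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05',
        "for": "10m",
        "labels": {"severity": "critical", "team": "platform"},
        "annotations": {
            "summary": "Error rate above 5% ({{ $value }})",
            "description": "More than 5% of requests are failing.",
            "runbook_url": "https://example.com/runbooks/high-error-rate",
        },
    }
    result = checker.check_alert_rule(rule)
    print(result["issues"])         # [] — this rule passes every hard check
    print(checker.recommendations)  # soft suggestions (e.g., add a 'component' or 'service' label)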
279
scripts/analyze_metrics.py
Normal file
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""

import argparse
import sys
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import statistics

try:
    import requests
except ImportError:
    print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
    sys.exit(1)

try:
    import boto3
except ImportError:
    boto3 = None


class MetricAnalyzer:
    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        self.source = source
        self.endpoint = endpoint
        self.region = region
        if source == "cloudwatch" and boto3:
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        elif source == "cloudwatch" and not boto3:
            print("⚠️ boto3 not installed. Install with: pip install boto3")
            sys.exit(1)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Query Prometheus for metric data."""
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)

        try:
            # Query range for last N hours
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)

            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }

            response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []

            return data['data']['result']

        except Exception as e:
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Query CloudWatch for metric data."""
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)

            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]

            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )

            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])

        except Exception as e:
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Detect anomalies using standard deviation method."""
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }

        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)

        anomalies = []
        for i, value in enumerate(values):
            if value > threshold_upper or value < threshold_lower:
                anomalies.append({
                    "index": i,
                    "value": value,
                    "deviation": abs(value - mean) / stdev if stdev > 0 else 0
                })

        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Analyze trend using simple linear regression."""
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}

        n = len(values)
        x = list(range(n))
        x_mean = sum(x) / n
        y_mean = sum(values) / n

        numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n))
        denominator = sum((x[i] - x_mean) ** 2 for i in range(n))

        if denominator == 0:
            return {"trend": "flat", "slope": 0}

        slope = numerator / denominator

        # Determine trend direction
        if abs(slope) < 0.01 * y_mean:  # Less than 1% change per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"

        return {
            "trend": trend,
            "slope": slope,
            "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0
        }


def print_results(results: Dict[str, Any]):
    """Pretty print analysis results."""
    print("\n" + "="*60)
    print("📊 METRIC ANALYSIS RESULTS")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    print(f"\n📈 Data Points: {results.get('data_points', 0)}")

    # Trend analysis
    if "trend" in results:
        trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "❓")
        print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}")
        if "rate_of_change" in results["trend"]:
            print(f"   Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval")

    # Anomaly detection
    if "anomalies" in results:
        anomaly_data = results["anomalies"]
        if anomaly_data["anomalies_detected"]:
            print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}")
            print(f"   Mean: {anomaly_data['stats']['mean']:.2f}")
            print(f"   Std Dev: {anomaly_data['stats']['stdev']:.2f}")
            print(f"   Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]")

            print("\n   Top Anomalies:")
            for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]:
                print(f"   • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)")
        else:
            print("\n✅ No anomalies detected")

    print("\n" + "="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Prometheus: Analyze request rate
  python3 analyze_metrics.py prometheus \\
      --endpoint http://localhost:9090 \\
      --query 'rate(http_requests_total[5m])' \\
      --hours 24

  # CloudWatch: Analyze CPU utilization
  python3 analyze_metrics.py cloudwatch \\
      --namespace AWS/EC2 \\
      --metric CPUUtilization \\
      --dimensions InstanceId=i-1234567890abcdef0 \\
      --hours 48
        """
    )

    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                        help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                        help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')

    args = parser.parse_args()

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    # Query metrics
    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)

        print(f"🔍 Querying Prometheus: {args.query}")
        results = analyzer.query_prometheus(args.query, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        # Extract values from first result series
        values = [float(v[1]) for v in results[0].get('values', [])]

    elif args.source == 'cloudwatch':
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)

        dims = dict(item.split('=') for item in args.dimensions.split(','))

        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        values = [point['Average'] for point in results]

    # Analyze metrics
    analysis_results = {
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity)
    }

    print_results(analysis_results)


if __name__ == "__main__":
    main()
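Usage note (illustrative): detect_anomalies() and analyze_trend() operate on a plain list of floats, so they can be exercised without a live Prometheus or CloudWatch backend. A minimal sketch, assuming the script is importable as analyze_metrics and using synthetic data:

    from analyze_metrics import MetricAnalyzer

    analyzer = MetricAnalyzer("prometheus")  # no endpoint needed for offline analysis
    values = [10.0] * 20
    values[10] = 100.0  # inject a spike well outside mean ± 2σ

    anomalies = analyzer.detect_anomalies(values, sensitivity=2.0)
    trend = analyzer.analyze_trend(values)

    print(anomalies["count"])         # 1 — only the spike at index 10 crosses the threshold
    print(anomalies["anomalies"][0])  # index, value, and deviation in σ
    print(trend["trend"])             # "stable" here — one spike barely moves the least-squares slope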
395
scripts/dashboard_generator.py
Normal file
@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Generate Grafana dashboards from templates.
Supports: web applications, Kubernetes, and databases (PostgreSQL/MySQL).
"""

import argparse
import sys
import json
from typing import Dict, List, Any
from pathlib import Path


class DashboardGenerator:
    def __init__(self, title: str, datasource: str = "Prometheus"):
        self.title = title
        self.datasource = datasource
        self.dashboard = self._create_base_dashboard()
        self.panel_id = 1
        self.row_y = 0

    def _create_base_dashboard(self) -> Dict[str, Any]:
        """Create base dashboard structure."""
        return {
            "dashboard": {
                "title": self.title,
                "tags": [],
                "timezone": "browser",
                "schemaVersion": 16,
                "version": 0,
                "refresh": "30s",
                "panels": [],
                "templating": {
                    "list": []
                },
                "time": {
                    "from": "now-6h",
                    "to": "now"
                }
            },
            "overwrite": True
        }

    def add_variable(self, name: str, label: str, query: str):
        """Add a template variable."""
        variable = {
            "name": name,
            "label": label,
            "type": "query",
            "datasource": self.datasource,
            "query": query,
            "refresh": 1,
            "regex": "",
            "multi": False,
            "includeAll": False
        }
        self.dashboard["dashboard"]["templating"]["list"].append(variable)

    def add_row(self, title: str):
        """Add a row panel."""
        panel = {
            "id": self.panel_id,
            "type": "row",
            "title": title,
            "collapsed": False,
            "gridPos": {"h": 1, "w": 24, "x": 0, "y": self.row_y}
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += 1

    def add_graph(self, title: str, targets: List[Dict[str, str]], unit: str = "short",
                  width: int = 12, height: int = 8):
        """Add a graph panel."""
        panel = {
            "id": self.panel_id,
            "type": "graph",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": target["query"],
                    "legendFormat": target.get("legend", ""),
                    "refId": chr(65 + i)  # A, B, C, etc.
                }
                for i, target in enumerate(targets)
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "yaxes": [
                {"format": unit, "label": None, "show": True},
                {"format": "short", "label": None, "show": True}
            ],
            "lines": True,
            "fill": 1,
            "linewidth": 2,
            "legend": {
                "show": True,
                "alignAsTable": True,
                "avg": True,
                "current": True,
                "max": True,
                "min": False,
                "total": False,
                "values": True
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += height

    def add_stat(self, title: str, query: str, unit: str = "short",
                 width: int = 6, height: int = 4):
        """Add a stat panel (single value)."""
        panel = {
            "id": self.panel_id,
            "type": "stat",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": query,
                    "refId": "A"
                }
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "options": {
                "graphMode": "area",
                "orientation": "auto",
                "reduceOptions": {
                    "values": False,
                    "calcs": ["lastNotNull"]
                }
            },
            "fieldConfig": {
                "defaults": {
                    "unit": unit,
                    "thresholds": {
                        "mode": "absolute",
                        "steps": [
                            {"value": None, "color": "green"},
                            {"value": 80, "color": "red"}
                        ]
                    }
                }
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1

    def generate_webapp_dashboard(self, service: str):
        """Generate dashboard for web application."""
        self.add_variable("service", "Service", f"label_values({service}_http_requests_total, service)")

        # Request metrics
        self.add_row("Request Metrics")

        self.add_graph(
            "Request Rate",
            [{"query": f'sum(rate({service}_http_requests_total[5m])) by (status)', "legend": "{{status}}"}],
            unit="reqps",
            width=12
        )

        self.add_graph(
            "Request Latency (p50, p95, p99)",
            [
                {"query": f'histogram_quantile(0.50, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p50"},
                {"query": f'histogram_quantile(0.95, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p95"},
                {"query": f'histogram_quantile(0.99, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p99"}
            ],
            unit="s",
            width=12
        )

        # Error rate
        self.add_row("Errors")

        self.add_graph(
            "Error Rate (%)",
            [{"query": f'sum(rate({service}_http_requests_total{{status=~"5.."}}[5m])) / sum(rate({service}_http_requests_total[5m])) * 100', "legend": "Error Rate"}],
            unit="percent",
            width=12
        )

        # Resource usage
        self.add_row("Resource Usage")

        self.add_graph(
            "CPU Usage",
            [{"query": f'sum(rate(process_cpu_seconds_total{{job="{service}"}}[5m])) * 100', "legend": "CPU %"}],
            unit="percent",
            width=12
        )

        self.add_graph(
            "Memory Usage",
            [{"query": f'process_resident_memory_bytes{{job="{service}"}}', "legend": "Memory"}],
            unit="bytes",
            width=12
        )

    def generate_kubernetes_dashboard(self, namespace: str):
        """Generate dashboard for Kubernetes cluster."""
        self.add_variable("namespace", "Namespace", "label_values(kube_pod_info, namespace)")

        # Cluster overview
        self.add_row("Cluster Overview")

        self.add_stat("Total Pods", f'count(kube_pod_info{{namespace="{namespace}"}})', width=6)
        self.add_stat("Running Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Running"}})', width=6)
        self.add_stat("Pending Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Pending"}})', width=6)
        self.add_stat("Failed Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Failed"}})', width=6)

        # Resource usage
        self.add_row("Resource Usage")

        self.add_graph(
            "CPU Usage by Pod",
            [{"query": f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "{{pod}}"}],
            unit="percent",
            width=12
        )

        self.add_graph(
            "Memory Usage by Pod",
            [{"query": f'sum(container_memory_usage_bytes{{namespace="{namespace}"}}) by (pod)', "legend": "{{pod}}"}],
            unit="bytes",
            width=12
        )

        # Network
        self.add_row("Network")

        self.add_graph(
            "Network I/O",
            [
                {"query": f'sum(rate(container_network_receive_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Receive - {{pod}}"},
                {"query": f'sum(rate(container_network_transmit_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Transmit - {{pod}}"}
            ],
            unit="Bps",
            width=12
        )

    def generate_database_dashboard(self, db_type: str, instance: str):
        """Generate dashboard for database (postgres/mysql)."""
        if db_type == "postgres":
            self._generate_postgres_dashboard(instance)
        elif db_type == "mysql":
            self._generate_mysql_dashboard(instance)

    def _generate_postgres_dashboard(self, instance: str):
        """Generate PostgreSQL dashboard."""
        self.add_row("PostgreSQL Metrics")

        self.add_graph(
            "Connections",
            [
                {"query": f'pg_stat_database_numbackends{{instance="{instance}"}}', "legend": "{{datname}}"}
            ],
            unit="short",
            width=12
        )

        self.add_graph(
            "Transactions per Second",
            [
                {"query": f'rate(pg_stat_database_xact_commit{{instance="{instance}"}}[5m])', "legend": "Commits"},
                {"query": f'rate(pg_stat_database_xact_rollback{{instance="{instance}"}}[5m])', "legend": "Rollbacks"}
            ],
            unit="tps",
            width=12
        )

        self.add_graph(
            "Query Duration (p95)",
            [
                {"query": f'histogram_quantile(0.95, rate(pg_stat_statements_total_time_bucket{{instance="{instance}"}}[5m]))', "legend": "p95"}
            ],
            unit="ms",
            width=12
        )

    def _generate_mysql_dashboard(self, instance: str):
        """Generate MySQL dashboard."""
        self.add_row("MySQL Metrics")

        self.add_graph(
            "Connections",
            [
                {"query": f'mysql_global_status_threads_connected{{instance="{instance}"}}', "legend": "Connected"},
                {"query": f'mysql_global_status_threads_running{{instance="{instance}"}}', "legend": "Running"}
            ],
            unit="short",
            width=12
        )

        self.add_graph(
            "Queries per Second",
            [
                {"query": f'rate(mysql_global_status_queries{{instance="{instance}"}}[5m])', "legend": "Queries"}
            ],
            unit="qps",
            width=12
        )

    def save(self, output_file: str):
        """Save dashboard to file."""
        try:
            with open(output_file, 'w') as f:
                json.dump(self.dashboard, f, indent=2)
            return True
        except Exception as e:
            print(f"❌ Error saving dashboard: {e}")
            return False


def main():
    parser = argparse.ArgumentParser(
        description="Generate Grafana dashboards from templates",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Web application dashboard
  python3 dashboard_generator.py webapp \\
      --title "My API Dashboard" \\
      --service my_api \\
      --output dashboard.json

  # Kubernetes dashboard
  python3 dashboard_generator.py kubernetes \\
      --title "K8s Namespace" \\
      --namespace production \\
      --output k8s-dashboard.json

  # Database dashboard
  python3 dashboard_generator.py database \\
      --title "PostgreSQL" \\
      --db-type postgres \\
      --instance db.example.com:5432 \\
      --output db-dashboard.json
        """
    )

    parser.add_argument('type', choices=['webapp', 'kubernetes', 'database'],
                        help='Dashboard type')
    parser.add_argument('--title', required=True, help='Dashboard title')
    parser.add_argument('--output', required=True, help='Output file path')
    parser.add_argument('--datasource', default='Prometheus', help='Data source name')

    # Web app specific
    parser.add_argument('--service', help='Service name (for webapp)')

    # Kubernetes specific
    parser.add_argument('--namespace', help='Kubernetes namespace')

    # Database specific
    parser.add_argument('--db-type', choices=['postgres', 'mysql'], help='Database type')
    parser.add_argument('--instance', help='Database instance')

    args = parser.parse_args()

    print(f"🎨 Generating {args.type} dashboard: {args.title}")

    generator = DashboardGenerator(args.title, args.datasource)

    if args.type == 'webapp':
        if not args.service:
            print("❌ --service required for webapp dashboard")
            sys.exit(1)
        generator.generate_webapp_dashboard(args.service)

    elif args.type == 'kubernetes':
        if not args.namespace:
            print("❌ --namespace required for kubernetes dashboard")
            sys.exit(1)
        generator.generate_kubernetes_dashboard(args.namespace)

    elif args.type == 'database':
        if not args.db_type or not args.instance:
            print("❌ --db-type and --instance required for database dashboard")
            sys.exit(1)
        generator.generate_database_dashboard(args.db_type, args.instance)

    if generator.save(args.output):
        print(f"✅ Dashboard saved to: {args.output}")
        print(f"\n📝 Import to Grafana:")
        print(f"   1. Go to Grafana → Dashboards → Import")
        print(f"   2. Upload {args.output}")
        print(f"   3. Select datasource and save")
    else:
        sys.exit(1)


if __name__ == "__main__":
    main()
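Usage note (illustrative): DashboardGenerator can also be used as a library and extended with additional panels before saving. A minimal sketch, assuming the script is importable as dashboard_generator; the service name "checkout" and the output path are made up:

    from dashboard_generator import DashboardGenerator

    gen = DashboardGenerator("Checkout Service", datasource="Prometheus")
    gen.generate_webapp_dashboard("checkout")  # expects checkout_http_requests_total-style metric names
    gen.add_row("Custom")
    gen.add_stat("Open File Descriptors", 'process_open_fds{job="checkout"}', unit="short")
    if gen.save("checkout-dashboard.json"):
        print("wrote checkout-dashboard.json")  # JSON payload for Grafana's dashboard import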
477
scripts/datadog_cost_analyzer.py
Normal file
@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
Analyze Datadog usage and identify cost optimization opportunities.
Helps find waste in custom metrics, logs, APM, and infrastructure monitoring.
"""

import argparse
import sys
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional

try:
    import requests
except ImportError:
    print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
    sys.exit(1)

try:
    from tabulate import tabulate
except ImportError:
    tabulate = None


class DatadogCostAnalyzer:
    # Pricing (as of 2024-2025)
    PRICING = {
        'infrastructure_pro': 15,  # per host per month
        'infrastructure_enterprise': 23,
        'custom_metric': 0.01,  # per metric per month (first 100 free per host)
        'log_ingestion': 0.10,  # per GB ingested per month
        'apm_host': 31,  # APM Pro per host per month
        'apm_span': 1.70,  # per million indexed spans
    }

    def __init__(self, api_key: str, app_key: str, site: str = "datadoghq.com"):
        self.api_key = api_key
        self.app_key = app_key
        self.site = site
        self.base_url = f"https://api.{site}"
        self.headers = {
            'DD-API-KEY': api_key,
            'DD-APPLICATION-KEY': app_key,
            'Content-Type': 'application/json'
        }

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make API request to Datadog."""
        try:
            url = f"{self.base_url}{endpoint}"
            response = requests.get(url, headers=self.headers, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"❌ API Error: {e}")
            return {}

    def get_usage_metrics(self, start_date: str, end_date: str) -> Dict[str, Any]:
        """Get usage metrics for specified date range."""
        endpoint = "/api/v1/usage/summary"
        params = {
            'start_month': start_date,
            'end_month': end_date,
            'include_org_details': 'true'
        }

        data = self._make_request(endpoint, params)
        return data.get('usage', [])

    def get_custom_metrics(self) -> Dict[str, Any]:
        """Get custom metrics usage and identify high-cardinality metrics."""
        endpoint = "/api/v1/usage/timeseries"

        # Get last 30 days
        end_date = datetime.now()
        start_date = end_date - timedelta(days=30)

        params = {
            'start_hr': int(start_date.timestamp()),
            'end_hr': int(end_date.timestamp())
        }

        data = self._make_request(endpoint, params)

        if not data:
            return {'metrics': [], 'total_count': 0}

        # Extract custom metrics info
        usage_data = data.get('usage', [])

        metrics_summary = {
            'total_custom_metrics': 0,
            'avg_custom_metrics': 0,
            'billable_metrics': 0
        }

        for day in usage_data:
            if 'timeseries' in day:
                for ts in day['timeseries']:
                    if ts.get('metric_category') == 'custom':
                        metrics_summary['total_custom_metrics'] = max(
                            metrics_summary['total_custom_metrics'],
                            ts.get('num_custom_timeseries', 0)
                        )

        # Calculate billable (first 100 free)
        metrics_summary['billable_metrics'] = max(0, metrics_summary['total_custom_metrics'] - 100)

        return metrics_summary

    def get_infrastructure_hosts(self) -> Dict[str, Any]:
        """Get infrastructure host count and breakdown."""
        endpoint = "/api/v1/usage/hosts"

        end_date = datetime.now()
        start_date = end_date - timedelta(days=30)

        params = {
            'start_hr': int(start_date.timestamp()),
            'end_hr': int(end_date.timestamp())
        }

        data = self._make_request(endpoint, params)

        if not data:
            return {'total_hosts': 0}

        usage = data.get('usage', [])

        host_summary = {
            'total_hosts': 0,
            'agent_hosts': 0,
            'aws_hosts': 0,
            'azure_hosts': 0,
            'gcp_hosts': 0,
            'container_count': 0
        }

        for day in usage:
            host_summary['total_hosts'] = max(host_summary['total_hosts'], day.get('host_count', 0))
            host_summary['agent_hosts'] = max(host_summary['agent_hosts'], day.get('agent_host_count', 0))
            host_summary['aws_hosts'] = max(host_summary['aws_hosts'], day.get('aws_host_count', 0))
            host_summary['azure_hosts'] = max(host_summary['azure_hosts'], day.get('azure_host_count', 0))
            host_summary['gcp_hosts'] = max(host_summary['gcp_hosts'], day.get('gcp_host_count', 0))
            host_summary['container_count'] = max(host_summary['container_count'], day.get('container_count', 0))

        return host_summary

    def get_log_usage(self) -> Dict[str, Any]:
        """Get log ingestion and retention usage."""
        endpoint = "/api/v1/usage/logs"

        end_date = datetime.now()
        start_date = end_date - timedelta(days=30)

        params = {
            'start_hr': int(start_date.timestamp()),
            'end_hr': int(end_date.timestamp())
        }

        data = self._make_request(endpoint, params)

        if not data:
            return {'total_gb': 0, 'daily_avg_gb': 0}

        usage = data.get('usage', [])

        total_ingested = 0
        days_count = len(usage)

        for day in usage:
            total_ingested += day.get('ingested_events_bytes', 0)

        total_gb = total_ingested / (1024**3)  # Convert to GB
        daily_avg_gb = total_gb / max(days_count, 1)

        return {
            'total_gb': total_gb,
            'daily_avg_gb': daily_avg_gb,
            'monthly_projected_gb': daily_avg_gb * 30
        }

    def get_unused_monitors(self) -> List[Dict[str, Any]]:
        """Find monitors that haven't alerted in 30+ days."""
        endpoint = "/api/v1/monitor"

        data = self._make_request(endpoint)

        if not data:
            return []

        monitors = data if isinstance(data, list) else []

        unused = []
        now = datetime.now()

        for monitor in monitors:
            # Check if monitor has triggered recently
            overall_state = monitor.get('overall_state')
            modified = monitor.get('modified', '')

            # If monitor has been in OK state and not modified in 30+ days
            try:
                if modified:
                    mod_date = datetime.fromisoformat(modified.replace('Z', '+00:00'))
                    days_since_modified = (now - mod_date.replace(tzinfo=None)).days

                    if days_since_modified > 30 and overall_state in ['OK', 'No Data']:
                        unused.append({
                            'name': monitor.get('name', 'Unknown'),
                            'id': monitor.get('id'),
                            'days_since_modified': days_since_modified,
                            'state': overall_state
                        })
            except Exception:
                pass

        return unused

    def calculate_costs(self, usage_data: Dict[str, Any]) -> Dict[str, float]:
        """Calculate estimated monthly costs."""
        costs = {
            'infrastructure': 0,
            'custom_metrics': 0,
            'logs': 0,
            'apm': 0,
            'total': 0
        }

        # Infrastructure (assuming Pro tier)
        if 'hosts' in usage_data:
            costs['infrastructure'] = usage_data['hosts'].get('total_hosts', 0) * self.PRICING['infrastructure_pro']

        # Custom metrics
        if 'custom_metrics' in usage_data:
            billable = usage_data['custom_metrics'].get('billable_metrics', 0)
            costs['custom_metrics'] = billable * self.PRICING['custom_metric']

        # Logs
        if 'logs' in usage_data:
            monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0)
            costs['logs'] = monthly_gb * self.PRICING['log_ingestion']

        costs['total'] = sum(costs.values())

        return costs

    def get_recommendations(self, usage_data: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate cost optimization recommendations."""
        recommendations = []

        # Custom metrics recommendations
        if 'custom_metrics' in usage_data:
            billable = usage_data['custom_metrics'].get('billable_metrics', 0)
            if billable > 500:
                savings = (billable * 0.3) * self.PRICING['custom_metric']  # Assume 30% reduction possible
                recommendations.append({
                    'category': 'Custom Metrics',
                    'issue': f'High custom metric count: {billable:,} billable metrics',
                    'action': 'Review metric tags for high cardinality, consider aggregating or dropping unused metrics',
                    'potential_savings': f'${savings:.2f}/month'
                })

        # Container vs VM recommendations
        if 'hosts' in usage_data:
            hosts = usage_data['hosts'].get('total_hosts', 0)
            containers = usage_data['hosts'].get('container_count', 0)

            if containers > hosts * 10:  # Many containers per host
                savings = hosts * 0.2 * self.PRICING['infrastructure_pro']
                recommendations.append({
                    'category': 'Infrastructure',
                    'issue': f'{containers:,} containers running on {hosts} hosts',
                    'action': 'Consider using container monitoring instead of host-based (can be 50-70% cheaper)',
                    'potential_savings': f'${savings:.2f}/month'
                })

        # Unused monitors
        if 'unused_monitors' in usage_data:
            count = len(usage_data['unused_monitors'])
            if count > 10:
                recommendations.append({
                    'category': 'Monitors',
                    'issue': f'{count} monitors unused for 30+ days',
                    'action': 'Delete or disable unused monitors to reduce noise and improve performance',
                    'potential_savings': 'Operational efficiency'
                })

        # Log volume recommendations
        if 'logs' in usage_data:
            monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0)
            if monthly_gb > 100:
                savings = (monthly_gb * 0.4) * self.PRICING['log_ingestion']  # 40% reduction
                recommendations.append({
                    'category': 'Logs',
                    'issue': f'High log volume: {monthly_gb:.1f} GB/month projected',
                    'action': 'Review log sources, implement sampling for debug logs, exclude health checks',
                    'potential_savings': f'${savings:.2f}/month'
                })

        # Migration recommendation if costs are high
        costs = self.calculate_costs(usage_data)
        if costs['total'] > 5000:
            oss_cost = usage_data['hosts'].get('total_hosts', 0) * 15  # Rough estimate for self-hosted
            savings = costs['total'] - oss_cost
            recommendations.append({
                'category': 'Strategic',
                'issue': f'Total monthly cost: ${costs["total"]:.2f}',
                'action': 'Consider migrating to open-source stack (Prometheus + Grafana + Loki)',
                'potential_savings': f'${savings:.2f}/month (~{(savings/costs["total"]*100):.0f}% reduction)'
            })

        return recommendations


def print_usage_summary(usage_data: Dict[str, Any]):
    """Print usage summary."""
    print("\n" + "="*70)
    print("📊 DATADOG USAGE SUMMARY")
    print("="*70)

    # Infrastructure
    if 'hosts' in usage_data:
        hosts = usage_data['hosts']
        print(f"\n🖥️ Infrastructure:")
        print(f"   Total Hosts: {hosts.get('total_hosts', 0):,}")
        print(f"   Agent Hosts: {hosts.get('agent_hosts', 0):,}")
        print(f"   AWS Hosts: {hosts.get('aws_hosts', 0):,}")
        print(f"   Azure Hosts: {hosts.get('azure_hosts', 0):,}")
        print(f"   GCP Hosts: {hosts.get('gcp_hosts', 0):,}")
        print(f"   Containers: {hosts.get('container_count', 0):,}")

    # Custom Metrics
    if 'custom_metrics' in usage_data:
        metrics = usage_data['custom_metrics']
        print(f"\n📈 Custom Metrics:")
        print(f"   Total: {metrics.get('total_custom_metrics', 0):,}")
        print(f"   Billable: {metrics.get('billable_metrics', 0):,} (first 100 free)")

    # Logs
    if 'logs' in usage_data:
        logs = usage_data['logs']
        print(f"\n📝 Logs:")
        print(f"   Daily Average: {logs.get('daily_avg_gb', 0):.2f} GB")
        print(f"   Monthly Projected: {logs.get('monthly_projected_gb', 0):.2f} GB")

    # Unused Monitors
    if 'unused_monitors' in usage_data:
        print(f"\n🔔 Unused Monitors:")
        print(f"   Count: {len(usage_data['unused_monitors'])}")


def print_cost_breakdown(costs: Dict[str, float]):
    """Print cost breakdown."""
    print("\n" + "="*70)
    print("💰 ESTIMATED MONTHLY COSTS")
    print("="*70)

    print(f"\n   Infrastructure Monitoring: ${costs['infrastructure']:,.2f}")
    print(f"   Custom Metrics: ${costs['custom_metrics']:,.2f}")
    print(f"   Log Management: ${costs['logs']:,.2f}")
    print(f"   APM: ${costs['apm']:,.2f}")
    print("   " + "-"*40)
    print(f"   TOTAL: ${costs['total']:,.2f}/month")
    print(f"          ${costs['total']*12:,.2f}/year")


def print_recommendations(recommendations: List[Dict]):
    """Print recommendations."""
    print("\n" + "="*70)
    print("💡 COST OPTIMIZATION RECOMMENDATIONS")
    print("="*70)

    total_savings = 0

    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec['category']}")
        print(f"   Issue: {rec['issue']}")
        print(f"   Action: {rec['action']}")
        print(f"   Potential Savings: {rec['potential_savings']}")

        # Extract savings amount if it's a dollar value
        if '$' in rec['potential_savings']:
            try:
                amount = float(rec['potential_savings'].replace('$', '').replace('/month', '').replace(',', ''))
                total_savings += amount
            except Exception:
                pass

    if total_savings > 0:
        print(f"\n{'='*70}")
        print(f"💵 Total Potential Monthly Savings: ${total_savings:,.2f}")
        print(f"💵 Total Potential Annual Savings: ${total_savings*12:,.2f}")
        print(f"{'='*70}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyze Datadog usage and identify cost optimization opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze current usage
  python3 datadog_cost_analyzer.py \\
      --api-key DD_API_KEY \\
      --app-key DD_APP_KEY

  # Use environment variables
  export DD_API_KEY=your_api_key
  export DD_APP_KEY=your_app_key
  python3 datadog_cost_analyzer.py

  # Specify site (for EU)
  python3 datadog_cost_analyzer.py --site datadoghq.eu

Required Datadog Permissions:
  - usage_read
  - monitors_read
        """
    )

    parser.add_argument('--api-key',
                        default=os.environ.get('DD_API_KEY'),
                        help='Datadog API key (or set DD_API_KEY env var)')
    parser.add_argument('--app-key',
                        default=os.environ.get('DD_APP_KEY'),
                        help='Datadog Application key (or set DD_APP_KEY env var)')
    parser.add_argument('--site',
                        default='datadoghq.com',
                        help='Datadog site (default: datadoghq.com, EU: datadoghq.eu)')

    args = parser.parse_args()

    if not args.api_key or not args.app_key:
        print("❌ Error: API key and Application key required")
        print("   Set via --api-key and --app-key flags or DD_API_KEY and DD_APP_KEY env vars")
        sys.exit(1)

    print("🔍 Analyzing Datadog usage...")
    print("   This may take 30-60 seconds...\n")

    analyzer = DatadogCostAnalyzer(args.api_key, args.app_key, args.site)

    # Gather usage data
    usage_data = {}

    print("   ⏳ Fetching infrastructure usage...")
    usage_data['hosts'] = analyzer.get_infrastructure_hosts()

    print("   ⏳ Fetching custom metrics...")
    usage_data['custom_metrics'] = analyzer.get_custom_metrics()

    print("   ⏳ Fetching log usage...")
    usage_data['logs'] = analyzer.get_log_usage()

    print("   ⏳ Finding unused monitors...")
    usage_data['unused_monitors'] = analyzer.get_unused_monitors()

    # Calculate costs
    costs = analyzer.calculate_costs(usage_data)

    # Generate recommendations
    recommendations = analyzer.get_recommendations(usage_data)

    # Print results
    print_usage_summary(usage_data)
    print_cost_breakdown(costs)
    print_recommendations(recommendations)

    print("\n" + "="*70)
    print("✅ Analysis complete!")
    print("="*70)


if __name__ == "__main__":
    main()
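Cost-model note (illustrative): calculate_costs() only combines the usage_data dict with the PRICING table, so the arithmetic can be sanity-checked offline without any API calls. A minimal sketch, assuming the script is importable as datadog_cost_analyzer; the keys shown mirror what main() assembles and the numbers are a hypothetical usage profile:

    from datadog_cost_analyzer import DatadogCostAnalyzer

    analyzer = DatadogCostAnalyzer(api_key="placeholder", app_key="placeholder")
    usage = {
        "hosts": {"total_hosts": 120, "container_count": 900},
        "custom_metrics": {"billable_metrics": 2500},
        "logs": {"monthly_projected_gb": 400},
    }
    costs = analyzer.calculate_costs(usage)
    # With the PRICING constants above: 120*$15 + 2500*$0.01 + 400*$0.10
    # = $1,800 + $25 + $40 = $1,865/month estimated
    print(costs["total"])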
297
scripts/health_check_validator.py
Normal file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate health check endpoints and analyze response quality.
|
||||
Checks: response time, status code, response format, dependencies.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
from typing import Dict, List, Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class HealthCheckValidator:
|
||||
def __init__(self, timeout: int = 5):
|
||||
self.timeout = timeout
|
||||
self.results = []
|
||||
|
||||
def validate_endpoint(self, url: str) -> Dict[str, Any]:
|
||||
"""Validate a health check endpoint."""
|
||||
result = {
|
||||
"url": url,
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"checks": [],
|
||||
"warnings": [],
|
||||
"errors": []
|
||||
}
|
||||
|
||||
try:
|
||||
# Make request
|
||||
start_time = time.time()
|
||||
response = requests.get(url, timeout=self.timeout, verify=True)
|
||||
response_time = time.time() - start_time
|
||||
|
||||
result["status_code"] = response.status_code
|
||||
result["response_time"] = response_time
|
||||
|
||||
# Check 1: Status code
|
||||
if response.status_code == 200:
|
||||
result["checks"].append("✅ Status code is 200")
|
||||
else:
|
||||
result["errors"].append(f"❌ Unexpected status code: {response.status_code} (expected 200)")
|
||||
|
||||
# Check 2: Response time
|
||||
if response_time < 1.0:
|
||||
result["checks"].append(f"✅ Response time: {response_time:.3f}s (< 1s)")
|
||||
elif response_time < 3.0:
|
||||
result["warnings"].append(f"⚠️ Slow response time: {response_time:.3f}s (should be < 1s)")
|
||||
else:
|
||||
result["errors"].append(f"❌ Very slow response time: {response_time:.3f}s (should be < 1s)")
|
||||
|
||||
# Check 3: Content type
|
||||
content_type = response.headers.get('Content-Type', '')
|
||||
if 'application/json' in content_type:
|
||||
result["checks"].append("✅ Content-Type is application/json")
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = response.json()
|
||||
result["response_data"] = data
|
||||
|
||||
# Check for common health check fields
|
||||
self._validate_json_structure(data, result)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
result["errors"].append("❌ Invalid JSON response")
|
||||
elif 'text/plain' in content_type:
|
||||
result["warnings"].append("⚠️ Content-Type is text/plain (JSON recommended)")
|
||||
result["response_data"] = response.text
|
||||
else:
|
||||
result["warnings"].append(f"⚠️ Unexpected Content-Type: {content_type}")
|
||||
|
||||
# Check 4: Response headers
|
||||
self._validate_headers(response.headers, result)
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
result["errors"].append(f"❌ Request timeout (> {self.timeout}s)")
|
||||
result["status_code"] = None
|
||||
result["response_time"] = None
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
result["errors"].append("❌ Connection error (endpoint unreachable)")
|
||||
result["status_code"] = None
|
||||
result["response_time"] = None
|
||||
|
||||
except requests.exceptions.SSLError:
|
||||
result["errors"].append("❌ SSL certificate validation failed")
|
||||
result["status_code"] = None
|
||||
result["response_time"] = None
|
||||
|
||||
except Exception as e:
|
||||
result["errors"].append(f"❌ Unexpected error: {str(e)}")
|
||||
result["status_code"] = None
|
||||
result["response_time"] = None
|
||||
|
||||
        # Overall status
        if result["errors"]:
            result["overall_status"] = "UNHEALTHY"
        elif result["warnings"]:
            result["overall_status"] = "DEGRADED"
        else:
            result["overall_status"] = "HEALTHY"

        return result

    def _validate_json_structure(self, data: Dict[str, Any], result: Dict[str, Any]):
        """Validate JSON health check structure."""
        # Check for status field
        if "status" in data:
            status = data["status"]
            if status in ["ok", "healthy", "up", "pass"]:
                result["checks"].append(f"✅ Status field present: '{status}'")
            else:
                result["warnings"].append(f"⚠️ Status field has unexpected value: '{status}'")
        else:
            result["warnings"].append("⚠️ Missing 'status' field (recommended)")

        # Check for version/build info
        if any(key in data for key in ["version", "build", "commit", "timestamp"]):
            result["checks"].append("✅ Version/build information present")
        else:
            result["warnings"].append("⚠️ No version/build information (recommended)")

        # Check for dependencies
        if "dependencies" in data or "checks" in data or "components" in data:
            result["checks"].append("✅ Dependency checks present")

            # Validate dependency structure
            deps = data.get("dependencies") or data.get("checks") or data.get("components")
            if isinstance(deps, dict):
                unhealthy_deps = []
                for name, info in deps.items():
                    if isinstance(info, dict):
                        dep_status = info.get("status", "unknown")
                        if dep_status not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)
                    elif isinstance(info, str):
                        if info not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)

                if unhealthy_deps:
                    result["warnings"].append(f"⚠️ Unhealthy dependencies: {', '.join(unhealthy_deps)}")
                else:
                    result["checks"].append(f"✅ All dependencies healthy ({len(deps)} checked)")
        else:
            result["warnings"].append("⚠️ No dependency checks (recommended for production services)")

        # Check for uptime/metrics
        if any(key in data for key in ["uptime", "metrics", "stats"]):
            result["checks"].append("✅ Metrics/stats present")

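    # Illustrative example (not part of the validator logic): a hypothetical
    # payload that would pass every check in _validate_json_structure. The
    # field values are made up and only show the expected shape.
    #
    #   {
    #     "status": "ok",
    #     "version": "1.4.2",
    #     "dependencies": {
    #       "database": {"status": "ok"},
    #       "cache": "healthy"
    #     },
    #     "uptime": 86400
    #   }
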
    def _validate_headers(self, headers: Dict[str, str], result: Dict[str, Any]):
        """Validate response headers."""
        # Check for caching headers
        cache_control = headers.get('Cache-Control', '')
        if 'no-cache' in cache_control or 'no-store' in cache_control:
            result["checks"].append("✅ Caching disabled (Cache-Control: no-cache)")
        else:
            result["warnings"].append("⚠️ Caching not explicitly disabled (add Cache-Control: no-cache)")

    def validate_multiple(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Validate multiple health check endpoints."""
        results = []
        for url in urls:
            print(f"🔍 Checking: {url}")
            result = self.validate_endpoint(url)
            results.append(result)
        return results


def print_result(result: Dict[str, Any], verbose: bool = False):
    """Print validation result."""
    status_emoji = {
        "HEALTHY": "✅",
        "DEGRADED": "⚠️",
        "UNHEALTHY": "❌"
    }

    print("\n" + "="*60)
    emoji = status_emoji.get(result["overall_status"], "❓")
    print(f"{emoji} {result['overall_status']}: {result['url']}")
    print("="*60)

    if result.get("status_code"):
        print(f"\n📊 Status Code: {result['status_code']}")
        print(f"⏱️ Response Time: {result['response_time']:.3f}s")

    # Print checks
    if result["checks"]:
        print(f"\n✅ Passed Checks:")
        for check in result["checks"]:
            print(f" {check}")

    # Print warnings
    if result["warnings"]:
        print(f"\n⚠️ Warnings:")
        for warning in result["warnings"]:
            print(f" {warning}")

    # Print errors
    if result["errors"]:
        print(f"\n❌ Errors:")
        for error in result["errors"]:
            print(f" {error}")

    # Print response data if verbose
    if verbose and "response_data" in result:
        print(f"\n📄 Response Data:")
        if isinstance(result["response_data"], dict):
            print(json.dumps(result["response_data"], indent=2))
        else:
            print(result["response_data"])

    print("="*60)


def print_summary(results: List[Dict[str, Any]]):
    """Print summary of multiple validations."""
    print("\n" + "="*60)
    print("📊 HEALTH CHECK VALIDATION SUMMARY")
    print("="*60)

    healthy = sum(1 for r in results if r["overall_status"] == "HEALTHY")
    degraded = sum(1 for r in results if r["overall_status"] == "DEGRADED")
    unhealthy = sum(1 for r in results if r["overall_status"] == "UNHEALTHY")

    print(f"\n✅ Healthy: {healthy}/{len(results)}")
    print(f"⚠️ Degraded: {degraded}/{len(results)}")
    print(f"❌ Unhealthy: {unhealthy}/{len(results)}")

    # Average only over endpoints that actually returned a response, so
    # unreachable endpoints do not drag the figure toward zero.
    timed = [r["response_time"] for r in results if r.get("response_time")]
    if timed:
        avg_response_time = sum(timed) / len(timed)
        print(f"\n⏱️ Average Response Time: {avg_response_time:.3f}s")

    print("="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Validate health check endpoints",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check a single endpoint
  python3 health_check_validator.py https://api.example.com/health

  # Check multiple endpoints
  python3 health_check_validator.py \\
      https://api.example.com/health \\
      https://api.example.com/readiness

  # Verbose output with response data
  python3 health_check_validator.py https://api.example.com/health --verbose

  # Custom timeout
  python3 health_check_validator.py https://api.example.com/health --timeout 10

Best Practices Checked:
  ✓ Returns 200 status code
  ✓ Response time < 1 second
  ✓ Returns JSON format
  ✓ Contains 'status' field
  ✓ Includes version/build info
  ✓ Checks dependencies
  ✓ Includes metrics
  ✓ Disables caching
"""
    )

    parser.add_argument('urls', nargs='+', help='Health check endpoint URL(s)')
    parser.add_argument('--timeout', type=int, default=5, help='Request timeout in seconds (default: 5)')
    parser.add_argument('--verbose', action='store_true', help='Show detailed response data')

    args = parser.parse_args()

    validator = HealthCheckValidator(timeout=args.timeout)

    results = validator.validate_multiple(args.urls)

    # Print individual results
    for result in results:
        print_result(result, args.verbose)

    # Print summary if multiple endpoints
    if len(results) > 1:
        print_summary(results)


if __name__ == "__main__":
    main()

321
scripts/log_analyzer.py
Normal file
@@ -0,0 +1,321 @@
#!/usr/bin/env python3
"""
Parse and analyze logs for patterns, errors, and anomalies.
Supports: error detection, frequency analysis, pattern matching.
"""

import argparse
import sys
import re
import json
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path

try:
    from tabulate import tabulate
except ImportError:
    tabulate = None


class LogAnalyzer:
    # Common log level patterns
    LOG_LEVELS = {
        'ERROR': r'\b(ERROR|Error|error)\b',
        'WARN': r'\b(WARN|Warning|warn|warning)\b',
        'INFO': r'\b(INFO|Info|info)\b',
        'DEBUG': r'\b(DEBUG|Debug|debug)\b',
        'FATAL': r'\b(FATAL|Fatal|fatal|CRITICAL|Critical)\b'
    }

    # Common error patterns
    ERROR_PATTERNS = {
        'exception': r'Exception|exception|EXCEPTION',
        'stack_trace': r'\s+at\s+.*\(.*:\d+\)',
        'http_error': r'\b[45]\d{2}\b',  # 4xx and 5xx HTTP codes
        'timeout': r'timeout|timed out|TIMEOUT',
        'connection_refused': r'connection refused|ECONNREFUSED',
        'out_of_memory': r'OutOfMemoryError|OOM|out of memory',
        'null_pointer': r'NullPointerException|null pointer|NPE',
        'database_error': r'SQLException|database error|DB error'
    }

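    # Illustrative matches (hypothetical log lines, not part of the parser):
    #   "2024-05-01 12:00:03 ERROR Connection refused by upstream"
    #       -> counted as ERROR, and matches the 'connection_refused' pattern
    #   "GET /api/orders 503 in 1204ms"
    #       -> matches 'http_error' (\b[45]\d{2}\b picks up the 503)
    #   "WARN request to billing timed out after 5s"
    #       -> counted as WARN, and matches the 'timeout' pattern
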
    def __init__(self, log_file: str):
        self.log_file = log_file
        self.lines = []
        self.log_levels = Counter()
        self.error_patterns = Counter()
        self.timestamps = []

    def parse_file(self) -> bool:
        """Parse log file."""
        try:
            with open(self.log_file, 'r', encoding='utf-8', errors='ignore') as f:
                self.lines = f.readlines()
            return True
        except Exception as e:
            print(f"❌ Error reading file: {e}")
            return False

    def analyze_log_levels(self):
        """Count log levels."""
        for line in self.lines:
            for level, pattern in self.LOG_LEVELS.items():
                if re.search(pattern, line):
                    self.log_levels[level] += 1
                    break  # Count each line only once

    def analyze_error_patterns(self):
        """Detect common error patterns."""
        for line in self.lines:
            for pattern_name, pattern in self.ERROR_PATTERNS.items():
                if re.search(pattern, line, re.IGNORECASE):
                    self.error_patterns[pattern_name] += 1

    def extract_timestamps(self, timestamp_pattern: Optional[str] = None):
        """Extract timestamps from logs."""
        if not timestamp_pattern:
            # Common timestamp patterns
            patterns = [
                r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}',  # ISO format
                r'\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}',     # Apache format
                r'\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}',     # Syslog format
            ]
        else:
            patterns = [timestamp_pattern]

        for line in self.lines:
            for pattern in patterns:
                match = re.search(pattern, line)
                if match:
                    self.timestamps.append(match.group())
                    break

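    # Illustrative matches for the default patterns (hypothetical values):
    #   ISO format:    "2024-05-01T12:00:03" or "2024-05-01 12:00:03"
    #   Apache format: "10/Oct/2023:13:55:36"
    #   Syslog format: "Oct 10 13:55:36"
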
    def find_error_lines(self, context: int = 2) -> List[Dict[str, Any]]:
        """Find error lines with context."""
        errors = []

        for i, line in enumerate(self.lines):
            # Check if line contains error keywords
            is_error = any(re.search(pattern, line, re.IGNORECASE)
                           for pattern in [self.LOG_LEVELS['ERROR'], self.LOG_LEVELS['FATAL']])

            if is_error:
                # Get context lines
                start = max(0, i - context)
                end = min(len(self.lines), i + context + 1)
                context_lines = self.lines[start:end]

                errors.append({
                    'line_number': i + 1,
                    'line': line.strip(),
                    'context': ''.join(context_lines)
                })

        return errors

    def analyze_frequency(self, time_window_minutes: int = 5) -> Dict[str, Any]:
        """Analyze log frequency over time."""
        if not self.timestamps:
            return {"error": "No timestamps found"}

        # This is a simplified version - in production you'd parse actual timestamps
        total_lines = len(self.lines)
        if self.timestamps:
            time_span = len(self.timestamps)
            avg_per_window = total_lines / max(1, time_span / time_window_minutes)
        else:
            avg_per_window = 0

        return {
            "total_lines": total_lines,
            "timestamps_found": len(self.timestamps),
            "avg_per_window": avg_per_window
        }

    def extract_unique_messages(self, pattern: str) -> List[str]:
        """Extract unique messages matching a pattern."""
        matches = []
        seen = set()

        for line in self.lines:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                msg = match.group() if match.lastindex is None else match.group(1)
                if msg not in seen:
                    matches.append(msg)
                    seen.add(msg)

        return matches

    def find_stack_traces(self) -> List[Dict[str, Any]]:
        """Extract complete stack traces."""
        stack_traces = []
        current_trace = []
        in_trace = False

        for i, line in enumerate(self.lines):
            # Start of stack trace
            if re.search(r'Exception|Error.*:', line):
                if current_trace:
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = [line.strip()]
                in_trace = True
            # Stack trace continuation
            elif in_trace and re.search(r'^\s+at\s+', line):
                current_trace.append(line.strip())
            # End of stack trace
            elif in_trace:
                if current_trace:
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = []
                in_trace = False

        # Add last trace if exists
        if current_trace:
            stack_traces.append({
                'line_start': len(self.lines) - len(current_trace) + 1,
                'trace': '\n'.join(current_trace)
            })

        return stack_traces

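    # Illustrative input for find_stack_traces (hypothetical log excerpt):
    #
    #   Exception in thread "main" java.lang.NullPointerException
    #       at com.example.Foo.bar(Foo.java:42)
    #       at com.example.Main.main(Main.java:10)
    #
    # The first line matches the start pattern, the indented "at ..." lines are
    # collected as continuations, and the three lines come back as one entry
    # whose 'line_start' points at the "Exception ..." line.

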
def print_analysis_results(analyzer: LogAnalyzer, show_errors: bool = False,
                           show_traces: bool = False):
    """Print analysis results."""
    print("\n" + "="*60)
    print("📝 LOG ANALYSIS RESULTS")
    print("="*60)

    print(f"\n📁 File: {analyzer.log_file}")
    print(f"📊 Total Lines: {len(analyzer.lines):,}")

    # Log levels
    if analyzer.log_levels:
        print(f"\n{'='*60}")
        print("📊 LOG LEVEL DISTRIBUTION:")
        print(f"{'='*60}")

        level_emoji = {
            'FATAL': '🔴',
            'ERROR': '❌',
            'WARN': '⚠️',
            'INFO': 'ℹ️',
            'DEBUG': '🐛'
        }

        for level, count in analyzer.log_levels.most_common():
            emoji = level_emoji.get(level, '•')
            percentage = (count / len(analyzer.lines)) * 100
            print(f"{emoji} {level:10s}: {count:6,} ({percentage:5.1f}%)")

    # Error patterns
    if analyzer.error_patterns:
        print(f"\n{'='*60}")
        print("🔍 ERROR PATTERNS DETECTED:")
        print(f"{'='*60}")

        for pattern, count in analyzer.error_patterns.most_common(10):
            print(f"• {pattern:20s}: {count:,} occurrences")

    # Timestamps
    if analyzer.timestamps:
        print(f"\n{'='*60}")
        print(f"⏰ Timestamps Found: {len(analyzer.timestamps):,}")
        print(f" First: {analyzer.timestamps[0]}")
        print(f" Last: {analyzer.timestamps[-1]}")

    # Error lines
    if show_errors:
        errors = analyzer.find_error_lines(context=1)
        if errors:
            print(f"\n{'='*60}")
            print(f"❌ ERROR LINES (showing first 10 of {len(errors)}):")
            print(f"{'='*60}")

            for error in errors[:10]:
                print(f"\nLine {error['line_number']}:")
                print(f" {error['line']}")

    # Stack traces
    if show_traces:
        traces = analyzer.find_stack_traces()
        if traces:
            print(f"\n{'='*60}")
            print(f"📚 STACK TRACES FOUND: {len(traces)}")
            print(f"{'='*60}")

            for i, trace in enumerate(traces[:5], 1):
                print(f"\nTrace {i} (starting at line {trace['line_start']}):")
                print(trace['trace'])
                if i < len(traces):
                    print("\n" + "-"*60)

    print("\n" + "="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze log files for errors, patterns, and anomalies",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic analysis
  python3 log_analyzer.py application.log

  # Show error lines with context
  python3 log_analyzer.py application.log --show-errors

  # Show stack traces
  python3 log_analyzer.py application.log --show-traces

  # Full analysis
  python3 log_analyzer.py application.log --show-errors --show-traces

Features:
  • Log level distribution (ERROR, WARN, INFO, DEBUG, FATAL)
  • Common error pattern detection
  • Timestamp extraction
  • Error line identification with context
  • Stack trace extraction
  • Frequency analysis
"""
    )

    parser.add_argument('log_file', help='Path to log file')
    parser.add_argument('--show-errors', action='store_true', help='Show error lines')
    parser.add_argument('--show-traces', action='store_true', help='Show stack traces')
    parser.add_argument('--timestamp-pattern', help='Custom regex for timestamp extraction')

    args = parser.parse_args()

    if not Path(args.log_file).exists():
        print(f"❌ File not found: {args.log_file}")
        sys.exit(1)

    print(f"🔍 Analyzing log file: {args.log_file}")

    analyzer = LogAnalyzer(args.log_file)

    if not analyzer.parse_file():
        sys.exit(1)

    # Perform analysis
    analyzer.analyze_log_levels()
    analyzer.analyze_error_patterns()
    analyzer.extract_timestamps(args.timestamp_pattern)

    # Print results
    print_analysis_results(analyzer, args.show_errors, args.show_traces)


if __name__ == "__main__":
    main()

365
scripts/slo_calculator.py
Normal file
@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
Calculate SLO compliance, error budgets, and burn rates.
Supports availability SLOs and latency SLOs.
"""

import argparse
import sys
from datetime import datetime, timedelta
from typing import Dict, Any, Optional

try:
    from tabulate import tabulate
except ImportError:
    print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate")
    tabulate = None


class SLOCalculator:
    # SLO targets and allowed downtime per period (all values in days)
    SLO_TARGETS = {
        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},
        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
    }

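    # Worked example for one row (using the table's own convention of
    # expressing downtime in days): a 99.9% target allows 0.1% downtime, i.e.
    # 365 * 0.001 = 0.365 days/year (~8.76 hours) and 30 * 0.001 = 0.03
    # days/month (~43.2 minutes).
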
    def __init__(self, slo_target: float, period_days: int = 30):
        """
        Initialize SLO calculator.

        Args:
            slo_target: SLO target percentage (e.g., 99.9)
            period_days: Time period in days (default: 30)
        """
        self.slo_target = slo_target
        self.period_days = period_days
        self.error_budget_minutes = self.calculate_error_budget_minutes()

    def calculate_error_budget_minutes(self) -> float:
        """Calculate error budget in minutes for the period."""
        total_minutes = self.period_days * 24 * 60
        allowed_error_rate = (100 - self.slo_target) / 100
        return total_minutes * allowed_error_rate

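    # Worked example: SLOCalculator(99.9, period_days=30) has
    # 30 * 24 * 60 = 43,200 minutes in the period and an allowed error rate of
    # 0.001, so the error budget is 43,200 * 0.001 = 43.2 minutes of downtime.
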
    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
        """
        Calculate availability SLO compliance.

        Args:
            total_requests: Total number of requests
            failed_requests: Number of failed requests

        Returns:
            Dict with SLO compliance metrics
        """
        if total_requests == 0:
            return {
                "error": "No requests in the period",
                "slo_met": False
            }

        success_rate = ((total_requests - failed_requests) / total_requests) * 100
        error_rate = (failed_requests / total_requests) * 100

        # Calculate error budget consumption
        allowed_failures = total_requests * ((100 - self.slo_target) / 100)
        error_budget_consumed = (failed_requests / allowed_failures) * 100 if allowed_failures > 0 else float('inf')
        error_budget_remaining = max(0, 100 - error_budget_consumed)

        # Determine if SLO is met
        slo_met = success_rate >= self.slo_target

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "failed_requests": failed_requests,
            "success_requests": total_requests - failed_requests,
            "success_rate": success_rate,
            "error_rate": error_rate,
            "slo_met": slo_met,
            "error_budget_total": allowed_failures,
            "error_budget_consumed": error_budget_consumed,
            "error_budget_remaining": error_budget_remaining,
            "margin": success_rate - self.slo_target
        }

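    # Worked example: with a 99.9% target, 1,000,000 requests and 1,500 failures,
    # success_rate = 99.85%, the budget allows 1,000 failures, so
    # error_budget_consumed = 150% and the SLO is violated (margin -0.05%).
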
    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
        """
        Calculate latency SLO compliance.

        Args:
            total_requests: Total number of requests
            requests_exceeding_threshold: Number of requests exceeding latency threshold

        Returns:
            Dict with SLO compliance metrics
        """
        if total_requests == 0:
            return {
                "error": "No requests in the period",
                "slo_met": False
            }

        within_threshold_rate = ((total_requests - requests_exceeding_threshold) / total_requests) * 100

        # Calculate error budget consumption
        allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100)
        error_budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100 if allowed_slow_requests > 0 else float('inf')
        error_budget_remaining = max(0, 100 - error_budget_consumed)

        slo_met = within_threshold_rate >= self.slo_target

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "requests_exceeding_threshold": requests_exceeding_threshold,
            "requests_within_threshold": total_requests - requests_exceeding_threshold,
            "within_threshold_rate": within_threshold_rate,
            "slo_met": slo_met,
            "error_budget_total": allowed_slow_requests,
            "error_budget_consumed": error_budget_consumed,
            "error_budget_remaining": error_budget_remaining,
            "margin": within_threshold_rate - self.slo_target,
            # Aliases so print_availability_results() can render latency results
            # with the same layout (it expects the availability key names).
            "success_rate": within_threshold_rate,
            "error_rate": 100 - within_threshold_rate,
            "success_requests": total_requests - requests_exceeding_threshold,
            "failed_requests": requests_exceeding_threshold
        }

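    # Worked example: with a 99.5% target, 500,000 requests and 3,000 over the
    # latency threshold, within_threshold_rate = 99.4%, the budget allows 2,500
    # slow requests, so error_budget_consumed = 120% and the SLO is violated.
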
    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
        """
        Calculate error budget burn rate.

        Args:
            errors_in_window: Number of errors in the time window
            requests_in_window: Total requests in the time window
            window_hours: Size of the time window in hours

        Returns:
            Dict with burn rate metrics
        """
        if requests_in_window == 0:
            return {"error": "No requests in window"}

        # Calculate actual error rate in this window
        actual_error_rate = (errors_in_window / requests_in_window) * 100

        # Calculate allowed error rate for SLO
        allowed_error_rate = 100 - self.slo_target

        # Burn rate = actual error rate / allowed error rate
        burn_rate = actual_error_rate / allowed_error_rate if allowed_error_rate > 0 else float('inf')

        # Time to exhaustion: at a burn rate of 1x the budget lasts exactly the
        # SLO period, so at Bx it lasts period / B.
        if burn_rate > 0:
            period_hours = self.period_days * 24
            hours_to_exhaustion = period_hours / burn_rate
        else:
            hours_to_exhaustion = float('inf')

        # Determine severity (multi-window burn rate thresholds)
        if burn_rate >= 14.4:  # 1 hour window; burns a 30-day budget in ~2 days
            severity = "critical"
        elif burn_rate >= 6:  # 6 hour window; burns a 30-day budget in ~5 days
            severity = "warning"
        elif burn_rate >= 1:
            severity = "elevated"
        else:
            severity = "normal"

        return {
            "window_hours": window_hours,
            "requests_in_window": requests_in_window,
            "errors_in_window": errors_in_window,
            "actual_error_rate": actual_error_rate,
            "allowed_error_rate": allowed_error_rate,
            "burn_rate": burn_rate,
            "hours_to_exhaustion": hours_to_exhaustion,
            "severity": severity
        }

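    # Worked example: with a 99.9% target (period_days=30), 10,000 requests and
    # 50 errors in a 1-hour window, actual_error_rate = 0.5% vs an allowed 0.1%,
    # so burn_rate = 5.0x ("elevated") and the 30-day budget would be exhausted
    # in 720 / 5 = 144 hours (6 days) if that rate continued.
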
    @staticmethod
    def print_slo_table():
        """Print table of common SLO targets and allowed downtime."""
        if not tabulate:
            print("Install tabulate for formatted output: pip install tabulate")
            return

        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
        print("="*60)

        # Week and day budgets are tiny fractions of a day, so show them in
        # hours and minutes instead of rounding to "0.00 days".
        headers = ["SLO", "Year (days)", "Month (days)", "Week (hours)", "Day (minutes)"]
        rows = []

        for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(),
                                     key=lambda item: float(item[0]), reverse=True):
            row = [
                f"{slo}%",
                f"{downtimes['year']:.2f}",
                f"{downtimes['month']:.2f}",
                f"{downtimes['week'] * 24:.2f}",
                f"{downtimes['day'] * 24 * 60:.2f}"
            ]
            rows.append(row)

        print(tabulate(rows, headers=headers, tablefmt="grid"))


def print_availability_results(results: Dict[str, Any]):
    """Print availability SLO results."""
    print("\n" + "="*60)
    print("📊 AVAILABILITY SLO COMPLIANCE")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    status_emoji = "✅" if results['slo_met'] else "❌"
    print(f"\n{status_emoji} SLO Status: {'MET' if results['slo_met'] else 'VIOLATED'}")
    print(f" Target: {results['slo_target']}%")
    print(f" Actual: {results['success_rate']:.3f}%")
    print(f" Margin: {results['margin']:+.3f}%")

    print(f"\n📈 Request Statistics:")
    print(f" Total Requests: {results['total_requests']:,}")
    print(f" Successful: {results['success_requests']:,}")
    print(f" Failed: {results['failed_requests']:,}")
    print(f" Error Rate: {results['error_rate']:.3f}%")

    print(f"\n💰 Error Budget:")
    budget_emoji = "✅" if results['error_budget_remaining'] > 20 else "⚠️" if results['error_budget_remaining'] > 0 else "❌"
    print(f" {budget_emoji} Remaining: {results['error_budget_remaining']:.1f}%")
    print(f" Consumed: {results['error_budget_consumed']:.1f}%")
    print(f" Allowed Failures: {results['error_budget_total']:.0f}")

    print("\n" + "="*60)


def print_burn_rate_results(results: Dict[str, Any]):
    """Print burn rate results."""
    print("\n" + "="*60)
    print("🔥 ERROR BUDGET BURN RATE")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    severity_emoji = {
        "critical": "🔴",
        "warning": "🟡",
        "elevated": "🟠",
        "normal": "🟢"
    }

    print(f"\n{severity_emoji.get(results['severity'], '❓')} Severity: {results['severity'].upper()}")
    print(f" Burn Rate: {results['burn_rate']:.2f}x")
    print(f" Time to Exhaustion: {results['hours_to_exhaustion']:.1f} hours ({results['hours_to_exhaustion']/24:.1f} days)")

    print(f"\n📊 Window Statistics:")
    print(f" Window: {results['window_hours']} hours")
    print(f" Requests: {results['requests_in_window']:,}")
    print(f" Errors: {results['errors_in_window']:,}")
    print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%")
    print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%")

    print("\n" + "="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Calculate SLO compliance and error budgets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show SLO reference table
  python3 slo_calculator.py --table

  # Calculate availability SLO
  python3 slo_calculator.py availability \\
      --slo 99.9 \\
      --total-requests 1000000 \\
      --failed-requests 1500 \\
      --period-days 30

  # Calculate latency SLO
  python3 slo_calculator.py latency \\
      --slo 99.5 \\
      --total-requests 500000 \\
      --slow-requests 3000 \\
      --period-days 7

  # Calculate burn rate
  python3 slo_calculator.py burn-rate \\
      --slo 99.9 \\
      --errors 50 \\
      --requests 10000 \\
      --window-hours 1
"""
    )

    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
                        help='Calculation mode')
    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')

    # Availability SLO arguments
    parser.add_argument('--total-requests', type=int, help='Total number of requests')
    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')

    # Latency SLO arguments
    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')

    # Burn rate arguments
    parser.add_argument('--errors', type=int, help='Number of errors in window')
    parser.add_argument('--requests', type=int, help='Number of requests in window')
    parser.add_argument('--window-hours', type=float, help='Window size in hours')

    args = parser.parse_args()

    # Show table if requested
    if args.table:
        SLOCalculator.print_slo_table()
        return

    if not args.mode:
        parser.print_help()
        return

    if not args.slo:
        print("❌ --slo required")
        sys.exit(1)

    calculator = SLOCalculator(args.slo, args.period_days)

    if args.mode == 'availability':
        if not args.total_requests or args.failed_requests is None:
            print("❌ --total-requests and --failed-requests required")
            sys.exit(1)

        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
        print_availability_results(results)

    elif args.mode == 'latency':
        if not args.total_requests or args.slow_requests is None:
            print("❌ --total-requests and --slow-requests required")
            sys.exit(1)

        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
        print_availability_results(results)  # Same format

    elif args.mode == 'burn-rate':
        if not all([args.errors is not None, args.requests, args.window_hours]):
            print("❌ --errors, --requests, and --window-hours required")
            sys.exit(1)

        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
        print_burn_rate_results(results)


if __name__ == "__main__":
    main()