Initial commit
This commit is contained in:
315
scripts/alert_quality_checker.py
Normal file
315
scripts/alert_quality_checker.py
Normal file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Audit Prometheus alert rules against best practices.
|
||||
Checks for: alert naming, severity labels, runbook links, expression quality.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Any
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("⚠️ Warning: 'PyYAML' library not found. Install with: pip install pyyaml")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class AlertQualityChecker:
    """Audit Prometheus alerting rules against common best practices.

    Each ``check_*`` method returns a list of issue strings for the rule it
    was given. Softer findings are appended to ``self.warnings`` and
    ``self.recommendations``; those lists accumulate across every rule (and
    every file) checked by the same instance, so use a fresh instance when
    per-file findings are needed.
    """

    # Values accepted for the 'severity' label.
    VALID_SEVERITIES = ('critical', 'warning', 'info')
    # Names that say nothing about the condition that fired.
    GENERIC_NAMES = ('Alert', 'Test', 'Warning', 'Error')
    # Substrings treated as evidence that an expression aggregates.
    AGGREGATORS = ('sum(', 'avg(', 'min(', 'max(', 'count(')
    # Operators treated as evidence that an expression has a threshold.
    COMPARISON_OPERATORS = ('>', '<', '==', '!=')
    # Critical alerts whose 'for' is at or below this many seconds are flagged.
    SHORT_FOR_SECONDS = 60

    _NAME_RE = re.compile(r'^[A-Z][a-zA-Z0-9]*$')
    # Label selectors ({job!="x"}); removed before looking for comparisons.
    _SELECTOR_RE = re.compile(r'\{[^}]*\}')
    # Prometheus duration syntax: units in fixed order, each optional.
    _DURATION_RE = re.compile(
        r'(?:(\d+)y)?(?:(\d+)w)?(?:(\d+)d)?(?:(\d+)h)?'
        r'(?:(\d+)m)?(?:(\d+)s)?(?:(\d+)ms)?'
    )
    # Seconds per unit, in the same order as the groups of _DURATION_RE.
    _UNIT_SECONDS = (31536000, 604800, 86400, 3600, 60, 1, 0.001)

    def __init__(self):
        self.issues = []
        self.warnings = []
        self.recommendations = []

    @staticmethod
    def _as_mapping(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict.

        YAML such as ``labels:`` with no body loads as ``None``; treating
        that as "no entries" lets the checks report missing keys instead of
        raising.
        """
        return value if isinstance(value, dict) else {}

    @classmethod
    def _duration_seconds(cls, value: Any) -> float:
        """Convert a Prometheus duration (e.g. ``'90s'``, ``'1h30m'``) to seconds.

        Raises:
            ValueError: if *value* is not written in Prometheus duration syntax.
        """
        text = str(value).strip()
        if text == '0':
            # A bare zero is the one duration that needs no unit.
            return 0.0
        match = cls._DURATION_RE.fullmatch(text) if text else None
        if match is None:
            raise ValueError(f"not a Prometheus duration: {value!r}")
        return float(sum(
            int(count) * unit
            for count, unit in zip(match.groups(), cls._UNIT_SECONDS)
            if count is not None
        ))

    def check_alert_name(self, alert_name: str) -> List[str]:
        """Check alert naming conventions.

        Returns one issue per violated rule: not PascalCase, shorter than
        five characters, or one of the generic names.
        """
        alert_name = str(alert_name)  # YAML may load a numeric name as int
        issues = []

        # Must start with an uppercase letter and contain only alphanumerics.
        if not self._NAME_RE.match(alert_name):
            issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)")

        # Should be descriptive
        if len(alert_name) < 5:
            issues.append(f"Alert name '{alert_name}' is too short, use descriptive names")

        # Avoid generic names
        if alert_name in self.GENERIC_NAMES:
            issues.append(f"Alert name '{alert_name}' is too generic")

        return issues

    def check_labels(self, alert: Dict[str, Any]) -> List[str]:
        """Check required and recommended labels.

        A missing or unrecognised ``severity`` is returned as an issue;
        missing ``team`` and ``component``/``service`` labels are recorded
        in ``self.recommendations``.
        """
        issues = []
        labels = self._as_mapping(alert.get('labels'))

        # Required labels
        if 'severity' not in labels:
            issues.append("Missing required 'severity' label (critical/warning/info)")
        elif labels['severity'] not in self.VALID_SEVERITIES:
            issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info")

        # Recommended labels
        if 'team' not in labels:
            self.recommendations.append("Consider adding 'team' label for routing")

        if 'component' not in labels and 'service' not in labels:
            self.recommendations.append("Consider adding 'component' or 'service' label")

        return issues

    def check_annotations(self, alert: Dict[str, Any]) -> List[str]:
        """Check annotation quality.

        A missing or very short ``summary`` and a missing ``description``
        are returned as issues; a missing runbook link and an untemplated
        summary are recorded in ``self.recommendations``. An annotation
        whose value is null is treated as missing.
        """
        issues = []
        annotations = self._as_mapping(alert.get('annotations'))
        summary = annotations.get('summary')

        # Required annotations
        if summary is None:
            issues.append("Missing 'summary' annotation")
        else:
            summary = str(summary)
            if len(summary) < 10:
                issues.append("Summary annotation is too short, provide clear description")

        if annotations.get('description') is None:
            issues.append("Missing 'description' annotation")

        # Runbook
        if 'runbook_url' not in annotations and 'runbook' not in annotations:
            self.recommendations.append("Consider adding 'runbook_url' for incident response")

        # Any '{{' means the summary already uses a template.
        if summary is not None and '{{' not in summary:
            self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})")

        return issues

    def check_expression(self, expr: str, alert_name: str) -> List[str]:
        """Check PromQL expression quality using textual heuristics.

        Args:
            expr: The rule's ``expr`` value; non-string values are
                converted with ``str``.
            alert_name: Accepted for interface compatibility; not used.

        Returns:
            Issues for a missing comparison operator and for ``rate(``
            used without any ``[`` range selector.
        """
        expr = str(expr)
        issues = []

        # Should have a threshold. Operators inside label selectors
        # ({code!="200"}) are matchers, not thresholds, so drop them first.
        outside_selectors = self._SELECTOR_RE.sub('{}', expr)
        if not any(op in outside_selectors for op in self.COMPARISON_OPERATORS):
            issues.append("Expression should include a comparison operator")

        # Should use rate() for counters
        if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr:
            self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)")

        # More than one selector and no aggregation function anywhere.
        if expr.count('{') > 1 and not any(agg in expr for agg in self.AGGREGATORS):
            self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.")

        # Check for proper time windows
        if 'rate(' in expr and '[' not in expr:
            issues.append("rate() requires a time window (e.g., rate(metric[5m]))")

        return issues

    def check_for_duration(self, rule: Dict[str, Any]) -> List[str]:
        """Check the 'for' clause that prevents alert flapping.

        A critical alert with no ``for`` is returned as an issue; for other
        severities a warning is recorded instead. A critical alert whose
        ``for`` parses to ``SHORT_FOR_SECONDS`` or less records a warning.
        Durations that cannot be parsed are left unjudged.
        """
        issues = []
        severity = self._as_mapping(rule.get('labels')).get('severity', 'unknown')
        duration = rule.get('for')

        if duration is None:
            if severity == 'critical':
                issues.append("Critical alerts should have 'for' clause to prevent flapping")
            else:
                self.warnings.append("Consider adding 'for' clause to prevent alert flapping")
            return issues

        if severity != 'critical':
            return issues

        # Compare the parsed length, not substrings: '11m' contains '1m'
        # but is not a short duration.
        try:
            seconds = self._duration_seconds(duration)
        except ValueError:
            return issues

        if seconds <= self.SHORT_FOR_SECONDS:
            self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts")

        return issues

    def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]:
        """Run every check against a single alerting rule.

        Returns:
            A dict with the rule's ``alert`` name, the combined ``issues``
            list and the rule's ``severity`` label (``'unknown'`` if absent).
        """
        alert_name = rule.get('alert', 'Unknown')
        issues = []

        # Check alert name
        issues.extend(self.check_alert_name(alert_name))

        # Check expression; a null 'expr' counts as missing.
        expr = rule.get('expr')
        if expr is None:
            issues.append("Missing 'expr' field")
        else:
            issues.extend(self.check_expression(expr, alert_name))

        # Check labels
        issues.extend(self.check_labels(rule))

        # Check annotations
        issues.extend(self.check_annotations(rule))

        # Check for duration
        issues.extend(self.check_for_duration(rule))

        return {
            "alert": alert_name,
            "issues": issues,
            "severity": self._as_mapping(rule.get('labels')).get('severity', 'unknown')
        }

    def analyze_file(self, filepath: str) -> Dict[str, Any]:
        """Analyze a Prometheus rules file.

        Returns:
            On success, a dict with ``file``, ``groups`` (number of groups),
            ``alerts_checked`` and ``results`` (one entry per alerting rule,
            each extended with its ``group`` name). On failure, a dict with
            a single ``error`` message. Read and parse failures are reported
            this way rather than raised.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            return {"error": f"Failed to parse file: {e}"}

        if not data:
            return {"error": "Empty or invalid YAML file"}
        if not isinstance(data, dict):
            return {"error": "Failed to parse file: top-level YAML value must be a mapping"}

        groups = data.get('groups') or []
        if not isinstance(groups, list):
            return {"error": "Failed to parse file: 'groups' must be a list"}

        results = []
        for group in groups:
            if not isinstance(group, dict):
                continue
            group_name = group.get('name', 'Unknown')
            rules = group.get('rules') or []
            if not isinstance(rules, list):
                continue

            for rule in rules:
                # Only check alerting rules, not recording rules
                if isinstance(rule, dict) and 'alert' in rule:
                    result = self.check_alert_rule(rule)
                    result['group'] = group_name
                    results.append(result)

        return {
            "file": filepath,
            "groups": len(groups),
            "alerts_checked": len(results),
            "results": results
        }
|
||||
|
||||
|
||||
def print_results(analysis: Dict[str, Any], checker: "AlertQualityChecker", verbose: bool = False):
    """Pretty print the analysis of one rules file.

    Args:
        analysis: Result of ``AlertQualityChecker.analyze_file``: either a
            dict with an ``error`` key, or one with ``file``, ``groups``,
            ``alerts_checked`` and ``results``.
        checker: Object whose ``warnings`` and ``recommendations`` lists
            are reported alongside the per-alert issues.
        verbose: When true, print every unique recommendation; otherwise
            only the first ten (in sorted order) are shown.
    """
    divider = "=" * 60

    print("\n" + divider)
    print("🚨 ALERT QUALITY CHECK RESULTS")
    print(divider)

    if "error" in analysis:
        print(f"\n❌ Error: {analysis['error']}")
        return

    print(f"\n📁 File: {analysis['file']}")
    print(f"📊 Groups: {analysis['groups']}")
    print(f"🔔 Alerts Checked: {analysis['alerts_checked']}")

    # An alert "fails" when it has at least one issue.
    failing = [result for result in analysis['results'] if result['issues']]
    failing_count = len(failing)

    print(f"\n{divider}")
    print("📈 Summary:")
    print(f" ❌ Alerts with Issues: {failing_count}")
    print(f" ⚠️ Warnings: {len(checker.warnings)}")
    print(f" 💡 Recommendations: {len(checker.recommendations)}")

    # Print detailed results
    if failing:
        print(f"\n{divider}")
        print("❌ ALERTS WITH ISSUES:")
        print(divider)

        for result in failing:
            print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})")
            print(f" Severity: {result['severity']}")
            print(" Issues:")
            for issue in result['issues']:
                print(f" • {issue}")

    # Print warnings (deduplicated; sorted so output is reproducible)
    if checker.warnings:
        print(f"\n{divider}")
        print("⚠️ WARNINGS:")
        print(divider)
        for warning in sorted(set(checker.warnings)):
            print(f"• {warning}")

    # Print recommendations (deduplicated; sorted so the truncated view
    # always shows the same entries)
    if checker.recommendations:
        unique_recs = sorted(set(checker.recommendations))
        shown = unique_recs if verbose else unique_recs[:10]

        print(f"\n{divider}")
        print("💡 RECOMMENDATIONS:")
        print(divider)
        for rec in shown:
            print(f"• {rec}")
        hidden = len(unique_recs) - len(shown)
        if hidden:
            print(f"  … and {hidden} more (use --verbose to show all)")

    # Overall score
    total_alerts = analysis['alerts_checked']
    if total_alerts > 0:
        passing = total_alerts - failing_count
        quality_score = (passing / total_alerts) * 100
        print(f"\n{divider}")
        print(f"📊 Quality Score: {quality_score:.1f}% ({passing}/{total_alerts} alerts passing)")
        print(f"{divider}\n")


def main():
    """Command-line entry point: audit one rules file or a directory of them.

    Exits with status 1 when the path does not exist or a directory
    contains no ``*.yml`` / ``*.yaml`` files.
    """
    parser = argparse.ArgumentParser(
        description="Audit Prometheus alert rules for quality and best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check a single file
  python3 alert_quality_checker.py alerts.yml

  # Check all YAML files in a directory
  python3 alert_quality_checker.py /path/to/prometheus/rules/

Best Practices Checked:
  ✓ Alert naming conventions (PascalCase, descriptive)
  ✓ Required labels (severity)
  ✓ Required annotations (summary, description)
  ✓ Runbook URL presence
  ✓ PromQL expression quality
  ✓ 'for' clause to prevent flapping
  ✓ Template variable usage
        """
    )

    parser.add_argument('path', help='Path to alert rules file or directory')
    parser.add_argument('--verbose', action='store_true', help='Show all recommendations')

    args = parser.parse_args()

    # Check if path is file or directory
    path = Path(args.path)

    if path.is_file():
        files = [str(path)]
    elif path.is_dir():
        # Sorted so files are always reported in the same order.
        files = sorted(
            [str(f) for f in path.rglob('*.yml')] + [str(f) for f in path.rglob('*.yaml')]
        )
    else:
        print(f"❌ Path not found: {args.path}")
        sys.exit(1)

    if not files:
        print(f"❌ No YAML files found in: {args.path}")
        sys.exit(1)

    print(f"🔍 Checking {len(files)} file(s)...")

    for filepath in files:
        # The checker accumulates warnings and recommendations, so use a
        # fresh one per file to keep each report limited to its own file.
        checker = AlertQualityChecker()
        analysis = checker.analyze_file(filepath)
        print_results(analysis, checker, verbose=args.verbose)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user