Initial commit

2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions
--- a/scripts/alert_quality_checker.py
+++ b/scripts/alert_quality_checker.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Audit Prometheus alert rules against best practices.
+Checks for: alert naming, severity labels, runbook links, expression quality.
+"""
+
+import argparse
+import sys
+import os
+import re
+from typing import Dict, List, Any
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("⚠️  Warning: 'PyYAML' library not found. Install with: pip install pyyaml")
+    sys.exit(1)
+
+
+class AlertQualityChecker:
+    """Audit Prometheus alerting rules against a set of best practices.
+
+    Every ``check_*`` method returns the blocking issues found for one rule
+    as a list of strings. Softer findings are appended to ``self.warnings``
+    and ``self.recommendations``; those lists are never cleared, so they
+    accumulate across all rules (and files) checked with one instance.
+    """
+
+    # Severity values accepted by check_labels().
+    VALID_SEVERITIES = ('critical', 'warning', 'info')
+
+    # Critical alerts whose 'for' duration is at or below this many seconds
+    # get a "might be too short" warning.
+    SHORT_CRITICAL_FOR_SECONDS = 60
+
+    # Prometheus duration syntax: one or more <integer><unit> parts, e.g. "1h30m".
+    # 'ms' is listed before 'm' and 's' so that "5ms" is not read as "5m".
+    _DURATION_RE = re.compile(r'(?:\d+(?:ms|s|m|h|d|w|y))+')
+    _DURATION_PART_RE = re.compile(r'(\d+)(ms|s|m|h|d|w|y)')
+    _UNIT_SECONDS = {
+        'ms': 0.001,
+        's': 1,
+        'm': 60,
+        'h': 3600,
+        'd': 86400,
+        'w': 604800,
+        'y': 31536000,
+    }
+
+    def __init__(self):
+        # Not written by any method of this class; kept because external
+        # code may read it.
+        self.issues = []
+        self.warnings = []
+        self.recommendations = []
+
+    @staticmethod
+    def _mapping(rule: Dict[str, Any], key: str) -> Dict[str, Any]:
+        """Return ``rule[key]`` if it is a mapping, otherwise an empty dict.
+
+        YAML such as ``labels:`` with no value loads as ``None``; treating
+        that as empty lets the checks report missing keys instead of raising.
+        """
+        value = rule.get(key)
+        return value if isinstance(value, dict) else {}
+
+    @classmethod
+    def _duration_seconds(cls, duration: Any):
+        """Convert a Prometheus duration (e.g. ``"1h30m"``) to seconds.
+
+        Returns the number of seconds, or ``None`` when the value does not
+        follow the ``<integer><unit>`` duration syntax.
+        """
+        text = str(duration).strip()
+        if text == '0':
+            # A bare zero is accepted as a duration without a unit.
+            return 0
+        if not cls._DURATION_RE.fullmatch(text):
+            return None
+        return sum(
+            int(amount) * cls._UNIT_SECONDS[unit]
+            for amount, unit in cls._DURATION_PART_RE.findall(text)
+        )
+
+    def check_alert_name(self, alert_name: str) -> List[str]:
+        """Check alert naming conventions.
+
+        Returns a list of issue messages; empty when the name is acceptable.
+        """
+        issues = []
+
+        # Must start with an uppercase letter and contain only ASCII letters
+        # and digits (PascalCase).
+        if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name):
+            issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)")
+
+        # Should be descriptive
+        if len(alert_name) < 5:
+            issues.append(f"Alert name '{alert_name}' is too short, use descriptive names")
+
+        # Avoid generic names
+        generic_names = ['Alert', 'Test', 'Warning', 'Error']
+        if alert_name in generic_names:
+            issues.append(f"Alert name '{alert_name}' is too generic")
+
+        return issues
+
+    def check_labels(self, alert: Dict[str, Any]) -> List[str]:
+        """Check required and recommended labels.
+
+        A missing or invalid ``severity`` is returned as an issue; missing
+        ``team`` and ``component``/``service`` labels are recorded in
+        ``self.recommendations``.
+        """
+        issues = []
+        labels = self._mapping(alert, 'labels')
+
+        # Required labels
+        if 'severity' not in labels:
+            issues.append("Missing required 'severity' label (critical/warning/info)")
+        elif labels['severity'] not in self.VALID_SEVERITIES:
+            issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info")
+
+        # Recommended labels
+        if 'team' not in labels:
+            self.recommendations.append("Consider adding 'team' label for routing")
+
+        if 'component' not in labels and 'service' not in labels:
+            self.recommendations.append("Consider adding 'component' or 'service' label")
+
+        return issues
+
+    def check_annotations(self, alert: Dict[str, Any]) -> List[str]:
+        """Check annotations quality.
+
+        Missing ``summary``/``description`` and a summary shorter than 10
+        characters are returned as issues; a missing runbook link and an
+        untemplated summary are recorded in ``self.recommendations``.
+        """
+        issues = []
+        annotations = self._mapping(alert, 'annotations')
+
+        # Required annotations
+        if 'summary' not in annotations:
+            issues.append("Missing 'summary' annotation")
+        else:
+            # An empty ``summary:`` loads as None; treat it as an empty string.
+            raw_summary = annotations['summary']
+            summary = '' if raw_summary is None else str(raw_summary)
+
+            if len(summary) < 10:
+                issues.append("Summary annotation is too short, provide clear description")
+
+            # Any '{{' marks a template expression, which also covers '{{ $value }}'.
+            if '{{' not in summary:
+                self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})")
+
+        if 'description' not in annotations:
+            issues.append("Missing 'description' annotation")
+
+        # Runbook
+        if 'runbook_url' not in annotations and 'runbook' not in annotations:
+            self.recommendations.append("Consider adding 'runbook_url' for incident response")
+
+        return issues
+
+    def check_expression(self, expr: str, alert_name: str) -> List[str]:
+        """Check PromQL expression quality using substring heuristics.
+
+        ``alert_name`` is accepted for interface compatibility and is not
+        used by the checks.
+        """
+        issues = []
+        # YAML may load a bare number or boolean here; the checks below are
+        # all substring tests, so work on the text form.
+        expr = str(expr)
+
+        # Should have a threshold
+        if not any(op in expr for op in ('>', '<', '==', '!=')):
+            issues.append("Expression should include a comparison operator")
+
+        # Should use rate() for counters
+        if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr:
+            self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)")
+
+        # Avoid instant queries without aggregation
+        if not any(agg in expr for agg in ('sum(', 'avg(', 'min(', 'max(', 'count(')):
+            if expr.count('{') > 1:  # Multiple metrics without aggregation
+                self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.")
+
+        # Check for proper time windows
+        if '[' not in expr and 'rate(' in expr:
+            issues.append("rate() requires a time window (e.g., rate(metric[5m]))")
+
+        return issues
+
+    def check_for_duration(self, rule: Dict[str, Any]) -> List[str]:
+        """Check for 'for' clause to prevent flapping.
+
+        A critical alert without ``for`` is returned as an issue. Other
+        alerts without ``for``, and critical alerts whose ``for`` is at most
+        ``SHORT_CRITICAL_FOR_SECONDS``, are recorded in ``self.warnings``.
+        """
+        issues = []
+        severity = self._mapping(rule, 'labels').get('severity', 'unknown')
+
+        if 'for' not in rule:
+            if severity == 'critical':
+                issues.append("Critical alerts should have 'for' clause to prevent flapping")
+            else:
+                self.warnings.append("Consider adding 'for' clause to prevent alert flapping")
+        elif severity == 'critical':
+            duration = rule['for']
+            # Compare the parsed length instead of searching for substrings,
+            # so values such as "11m" or "120s" are not mistaken for "1m"/"0s".
+            seconds = self._duration_seconds(duration)
+            if seconds is not None and seconds <= self.SHORT_CRITICAL_FOR_SECONDS:
+                self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts")
+
+        return issues
+
+    def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]:
+        """Run every check on a single alert rule.
+
+        Returns a dict with the alert name (``alert``), the collected
+        ``issues`` and the rule's ``severity`` label (``'unknown'`` if absent).
+        """
+        raw_name = rule.get('alert')
+        # Names that YAML loaded as non-strings are checked in their text form.
+        alert_name = 'Unknown' if raw_name is None else str(raw_name)
+        issues = []
+
+        # Check alert name
+        issues.extend(self.check_alert_name(alert_name))
+
+        # Check expression
+        if 'expr' not in rule:
+            issues.append("Missing 'expr' field")
+        else:
+            issues.extend(self.check_expression(rule['expr'], alert_name))
+
+        # Check labels
+        issues.extend(self.check_labels(rule))
+
+        # Check annotations
+        issues.extend(self.check_annotations(rule))
+
+        # Check for duration
+        issues.extend(self.check_for_duration(rule))
+
+        return {
+            "alert": alert_name,
+            "issues": issues,
+            "severity": self._mapping(rule, 'labels').get('severity', 'unknown')
+        }
+
+    def analyze_file(self, filepath: str) -> Dict[str, Any]:
+        """Analyze a Prometheus rules file.
+
+        Returns ``{"error": message}`` when the file cannot be read, parsed
+        or does not have the expected structure. Otherwise returns a dict
+        with ``file``, ``groups`` (number of groups), ``alerts_checked`` and
+        ``results`` (one ``check_alert_rule`` result per alerting rule, with
+        the group name added under ``group``).
+        """
+        try:
+            with open(filepath, 'r', encoding='utf-8') as f:
+                data = yaml.safe_load(f)
+        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
+            return {"error": f"Failed to parse file: {e}"}
+
+        if not data:
+            return {"error": "Empty or invalid YAML file"}
+        if not isinstance(data, dict):
+            return {"error": "Failed to parse file: top-level YAML value is not a mapping"}
+
+        groups = data.get('groups') or []
+        if not isinstance(groups, list):
+            return {"error": "Failed to parse file: 'groups' is not a list"}
+
+        results = []
+        for group in groups:
+            if not isinstance(group, dict):
+                continue
+            group_name = group.get('name', 'Unknown')
+            rules = group.get('rules') or []
+            if not isinstance(rules, list):
+                continue
+
+            for rule in rules:
+                # Only check alerting rules, not recording rules
+                if isinstance(rule, dict) and 'alert' in rule:
+                    result = self.check_alert_rule(rule)
+                    result['group'] = group_name
+                    results.append(result)
+
+        return {
+            "file": filepath,
+            "groups": len(groups),
+            "alerts_checked": len(results),
+            "results": results
+        }
+
+
+def print_results(analysis: Dict[str, Any], checker: "AlertQualityChecker", verbose: bool = False):
+    """Pretty print the analysis of one rules file.
+
+    Args:
+        analysis: Result of ``AlertQualityChecker.analyze_file`` -- either
+            ``{"error": ...}`` or a dict with ``file``, ``groups``,
+            ``alerts_checked`` and ``results``.
+        checker: Object whose ``warnings`` and ``recommendations`` lists are
+            printed after the per-alert issues.
+        verbose: When true, print every unique recommendation instead of
+            only the first 10.
+    """
+    separator = "=" * 60
+
+    print("\n" + separator)
+    print("🚨 ALERT QUALITY CHECK RESULTS")
+    print(separator)
+
+    if "error" in analysis:
+        print(f"\n❌ Error: {analysis['error']}")
+        return
+
+    print(f"\n📁 File: {analysis['file']}")
+    print(f"📊 Groups: {analysis['groups']}")
+    print(f"🔔 Alerts Checked: {analysis['alerts_checked']}")
+
+    # Alerts that have at least one issue
+    failing = [result for result in analysis['results'] if result['issues']]
+    critical_count = len(failing)
+
+    print(f"\n{separator}")
+    print("📈 Summary:")
+    print(f"   ❌ Alerts with Issues: {critical_count}")
+    print(f"   ⚠️  Warnings: {len(checker.warnings)}")
+    print(f"   💡 Recommendations: {len(checker.recommendations)}")
+
+    # Print detailed results
+    if failing:
+        print(f"\n{separator}")
+        print("❌ ALERTS WITH ISSUES:")
+        print(separator)
+
+        for result in failing:
+            print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})")
+            print(f"   Severity: {result['severity']}")
+            print("   Issues:")
+            for issue in result['issues']:
+                print(f"   • {issue}")
+
+    # Print warnings
+    if checker.warnings:
+        print(f"\n{separator}")
+        print("⚠️  WARNINGS:")
+        print(separator)
+        # dict.fromkeys removes duplicates while keeping first-seen order,
+        # so the report is identical from run to run (a set is not ordered).
+        for warning in dict.fromkeys(checker.warnings):
+            print(f"• {warning}")
+
+    # Print recommendations
+    if checker.recommendations:
+        print(f"\n{separator}")
+        print("💡 RECOMMENDATIONS:")
+        print(separator)
+        unique_recs = list(dict.fromkeys(checker.recommendations))
+        shown = unique_recs if verbose else unique_recs[:10]
+        for rec in shown:
+            print(f"• {rec}")
+        hidden = len(unique_recs) - len(shown)
+        if hidden > 0:
+            print(f"  … and {hidden} more (use --verbose to show all)")
+
+    # Overall score
+    total_alerts = analysis['alerts_checked']
+    if total_alerts > 0:
+        passing = total_alerts - critical_count
+        quality_score = (passing / total_alerts) * 100
+        print(f"\n{separator}")
+        print(f"📊 Quality Score: {quality_score:.1f}% ({passing}/{total_alerts} alerts passing)")
+        print(f"{separator}\n")
+
+
+def main():
+    """Parse the command line and audit each matching rules file."""
+    parser = argparse.ArgumentParser(
+        description="Audit Prometheus alert rules for quality and best practices",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Check a single file
+  python3 alert_quality_checker.py alerts.yml
+
+  # Check all YAML files in a directory
+  python3 alert_quality_checker.py /path/to/prometheus/rules/
+
+Best Practices Checked:
+  ✓ Alert naming conventions (PascalCase, descriptive)
+  ✓ Required labels (severity)
+  ✓ Required annotations (summary, description)
+  ✓ Runbook URL presence
+  ✓ PromQL expression quality
+  ✓ 'for' clause to prevent flapping
+  ✓ Template variable usage
+        """
+    )
+
+    parser.add_argument('path', help='Path to alert rules file or directory')
+    parser.add_argument('--verbose', action='store_true', help='Show all recommendations')
+
+    args = parser.parse_args()
+
+    # Check if path is file or directory
+    path = Path(args.path)
+
+    if path.is_file():
+        files = [str(path)]
+    elif path.is_dir():
+        # Sorted so files are reported in the same order on every run.
+        files = sorted(
+            str(f) for pattern in ('*.yml', '*.yaml') for f in path.rglob(pattern)
+        )
+    else:
+        print(f"❌ Path not found: {args.path}")
+        sys.exit(1)
+
+    if not files:
+        print(f"❌ No YAML files found in: {args.path}")
+        sys.exit(1)
+
+    print(f"🔍 Checking {len(files)} file(s)...")
+
+    for filepath in files:
+        # Use a fresh checker per file: warnings and recommendations
+        # accumulate on the instance, so sharing one would leak an earlier
+        # file's findings into every later file's report.
+        checker = AlertQualityChecker()
+        analysis = checker.analyze_file(filepath)
+        print_results(analysis, checker, verbose=args.verbose)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()