Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Audit Prometheus alert rules against best practices.
Checks for: alert naming, severity labels, runbook links, expression quality.
"""
import argparse
import sys
import os
import re
from typing import Dict, List, Any
from pathlib import Path
try:
import yaml
except ImportError:
print("⚠️ Warning: 'PyYAML' library not found. Install with: pip install pyyaml")
sys.exit(1)
class AlertQualityChecker:
    """Audit Prometheus alerting rules for naming, labels, annotations and PromQL quality.

    Hard problems are returned as per-rule issue lists by the ``check_*``
    methods; softer advice accumulates on the instance in ``warnings`` and
    ``recommendations`` and is de-duplicated by the caller when printed.
    """

    # Prometheus duration unit -> seconds (the units valid in rule files).
    _DURATION_UNITS = {
        'ms': 0.001, 's': 1, 'm': 60, 'h': 3600,
        'd': 86400, 'w': 604800, 'y': 31536000,
    }
    # A critical alert's 'for' clause at/below this many seconds is flagged
    # as likely to flap.
    _MIN_CRITICAL_FOR_SECONDS = 120

    def __init__(self):
        self.issues = []            # kept for API compatibility (not populated here)
        self.warnings = []          # soft, file-wide problems
        self.recommendations = []   # optional improvements

    @classmethod
    def _duration_to_seconds(cls, duration: str):
        """Parse a Prometheus duration string (e.g. '30s', '5m', '1h30m') into seconds.

        Returns None when the string contains no recognizable duration parts.
        BUG FIX: the previous implementation used substring checks such as
        ``'1m' in duration`` which misfired on values like '21m'.
        """
        parts = re.findall(r'(\d+(?:\.\d+)?)(ms|s|m|h|d|w|y)', duration.strip())
        if not parts:
            return None
        return sum(float(value) * cls._DURATION_UNITS[unit] for value, unit in parts)

    def check_alert_name(self, alert_name: str) -> List[str]:
        """Check alert naming conventions (PascalCase, descriptive, non-generic)."""
        issues = []
        # PascalCase, e.g. HighCPUUsage (the regex intentionally rejects snake_case).
        if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name):
            issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)")
        # Very short names are rarely descriptive enough.
        if len(alert_name) < 5:
            issues.append(f"Alert name '{alert_name}' is too short, use descriptive names")
        # Reject placeholder-style names outright.
        if alert_name in ('Alert', 'Test', 'Warning', 'Error'):
            issues.append(f"Alert name '{alert_name}' is too generic")
        return issues

    def check_labels(self, alert: Dict[str, Any]) -> List[str]:
        """Check required ('severity') and recommended ('team', 'component'/'service') labels."""
        issues = []
        labels = alert.get('labels', {})
        # 'severity' is mandatory and must use the standard three levels.
        if 'severity' not in labels:
            issues.append("Missing required 'severity' label (critical/warning/info)")
        elif labels['severity'] not in ('critical', 'warning', 'info'):
            issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info")
        # Routing-oriented labels are recommended, not required.
        if 'team' not in labels:
            self.recommendations.append("Consider adding 'team' label for routing")
        if 'component' not in labels and 'service' not in labels:
            self.recommendations.append("Consider adding 'component' or 'service' label")
        return issues

    def check_annotations(self, alert: Dict[str, Any]) -> List[str]:
        """Check annotation quality: summary, description, runbook link, templating."""
        issues = []
        annotations = alert.get('annotations', {})
        if 'summary' not in annotations:
            issues.append("Missing 'summary' annotation")
        else:
            if len(annotations['summary']) < 10:
                issues.append("Summary annotation is too short, provide clear description")
            # '{{' covers '{{ $value }}' and any other template variable.
            if '{{' not in annotations['summary']:
                self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})")
        if 'description' not in annotations:
            issues.append("Missing 'description' annotation")
        # A runbook link dramatically speeds up incident response.
        if 'runbook_url' not in annotations and 'runbook' not in annotations:
            self.recommendations.append("Consider adding 'runbook_url' for incident response")
        return issues

    def check_expression(self, expr: str, alert_name: str) -> List[str]:
        """Heuristic checks on the PromQL expression."""
        issues = []
        # An alerting expression should compare against a threshold.
        if not any(op in expr for op in ('>', '<', '==', '!=')):
            issues.append("Expression should include a comparison operator")
        # Counters (*_total) should be rated, not used raw.
        if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr:
            self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)")
        # Several selectors without any aggregation usually means noisy alerts.
        if not any(agg in expr for agg in ('sum(', 'avg(', 'min(', 'max(', 'count(')):
            if expr.count('{') > 1:  # multiple label selectors without aggregation
                self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.")
        # rate() without a range selector is invalid PromQL.
        if 'rate(' in expr and '[' not in expr:
            issues.append("rate() requires a time window (e.g., rate(metric[5m]))")
        return issues

    def check_for_duration(self, rule: Dict[str, Any]) -> List[str]:
        """Check the 'for' clause, which damps transient spikes (flapping)."""
        issues = []
        severity = rule.get('labels', {}).get('severity', 'unknown')
        if 'for' not in rule:
            if severity == 'critical':
                issues.append("Critical alerts should have 'for' clause to prevent flapping")
            else:
                self.warnings.append("Consider adding 'for' clause to prevent alert flapping")
        else:
            duration = rule['for']
            seconds = self._duration_to_seconds(str(duration))
            if seconds is None:
                issues.append(f"Could not parse 'for' duration: {duration!r}")
            elif severity == 'critical' and seconds < self._MIN_CRITICAL_FOR_SECONDS:
                self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts")
        return issues

    def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]:
        """Run every check on a single alerting rule and collect its issues."""
        alert_name = rule.get('alert', 'Unknown')
        issues = []
        issues.extend(self.check_alert_name(alert_name))
        if 'expr' not in rule:
            issues.append("Missing 'expr' field")
        else:
            issues.extend(self.check_expression(rule['expr'], alert_name))
        issues.extend(self.check_labels(rule))
        issues.extend(self.check_annotations(rule))
        issues.extend(self.check_for_duration(rule))
        return {
            "alert": alert_name,
            "issues": issues,
            "severity": rule.get('labels', {}).get('severity', 'unknown')
        }

    def analyze_file(self, filepath: str) -> Dict[str, Any]:
        """Load a Prometheus rules file and check every alerting rule in it.

        Recording rules (no 'alert' key) are skipped. Returns a summary dict,
        or {'error': ...} when the file cannot be read or parsed.
        """
        try:
            with open(filepath, 'r') as f:
                data = yaml.safe_load(f)
            if not data:
                return {"error": "Empty or invalid YAML file"}
            results = []
            groups = data.get('groups', [])
            for group in groups:
                group_name = group.get('name', 'Unknown')
                for rule in group.get('rules', []):
                    # Only alerting rules are audited; recording rules carry
                    # 'record' instead of 'alert'. Guard against scalar rules.
                    if isinstance(rule, dict) and 'alert' in rule:
                        result = self.check_alert_rule(rule)
                        result['group'] = group_name
                        results.append(result)
            return {
                "file": filepath,
                "groups": len(groups),
                "alerts_checked": len(results),
                "results": results
            }
        except Exception as e:
            # Boundary: report parse/IO failures instead of crashing the scan.
            return {"error": f"Failed to parse file: {e}"}
def print_results(analysis: Dict[str, Any], checker: AlertQualityChecker):
"""Pretty print analysis results."""
print("\n" + "="*60)
print("🚨 ALERT QUALITY CHECK RESULTS")
print("="*60)
if "error" in analysis:
print(f"\n❌ Error: {analysis['error']}")
return
print(f"\n📁 File: {analysis['file']}")
print(f"📊 Groups: {analysis['groups']}")
print(f"🔔 Alerts Checked: {analysis['alerts_checked']}")
# Count issues by severity
critical_count = 0
warning_count = 0
for result in analysis['results']:
if result['issues']:
critical_count += 1
print(f"\n{'='*60}")
print(f"📈 Summary:")
print(f" ❌ Alerts with Issues: {critical_count}")
print(f" ⚠️ Warnings: {len(checker.warnings)}")
print(f" 💡 Recommendations: {len(checker.recommendations)}")
# Print detailed results
if critical_count > 0:
print(f"\n{'='*60}")
print("❌ ALERTS WITH ISSUES:")
print(f"{'='*60}")
for result in analysis['results']:
if result['issues']:
print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})")
print(f" Severity: {result['severity']}")
print(" Issues:")
for issue in result['issues']:
print(f"{issue}")
# Print warnings
if checker.warnings:
print(f"\n{'='*60}")
print("⚠️ WARNINGS:")
print(f"{'='*60}")
for warning in set(checker.warnings): # Remove duplicates
print(f"{warning}")
# Print recommendations
if checker.recommendations:
print(f"\n{'='*60}")
print("💡 RECOMMENDATIONS:")
print(f"{'='*60}")
for rec in list(set(checker.recommendations))[:10]: # Top 10 unique recommendations
print(f"{rec}")
# Overall score
total_alerts = analysis['alerts_checked']
if total_alerts > 0:
quality_score = ((total_alerts - critical_count) / total_alerts) * 100
print(f"\n{'='*60}")
print(f"📊 Quality Score: {quality_score:.1f}% ({total_alerts - critical_count}/{total_alerts} alerts passing)")
print(f"{'='*60}\n")
def main():
    """CLI entry point: parse arguments, collect rule files, run the checker."""
    parser = argparse.ArgumentParser(
        description="Audit Prometheus alert rules for quality and best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Check a single file
python3 alert_quality_checker.py alerts.yml
# Check all YAML files in a directory
python3 alert_quality_checker.py /path/to/prometheus/rules/
Best Practices Checked:
✓ Alert naming conventions (PascalCase, descriptive)
✓ Required labels (severity)
✓ Required annotations (summary, description)
✓ Runbook URL presence
✓ PromQL expression quality
'for' clause to prevent flapping
✓ Template variable usage
""",
    )
    parser.add_argument('path', help='Path to alert rules file or directory')
    parser.add_argument('--verbose', action='store_true', help='Show all recommendations')
    args = parser.parse_args()

    checker = AlertQualityChecker()

    # Resolve the argument into the list of YAML files to inspect
    # (all *.yml first, then all *.yaml, matching recursive search order).
    target = Path(args.path)
    if target.is_file():
        files = [str(target)]
    elif target.is_dir():
        files = [str(p) for pattern in ('*.yml', '*.yaml') for p in target.rglob(pattern)]
    else:
        print(f"❌ Path not found: {args.path}")
        sys.exit(1)
    if not files:
        print(f"❌ No YAML files found in: {args.path}")
        sys.exit(1)

    print(f"🔍 Checking {len(files)} file(s)...")
    for filepath in files:
        print_results(checker.analyze_file(filepath), checker)


if __name__ == "__main__":
    main()

279
scripts/analyze_metrics.py Normal file
View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""
import argparse
import sys
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import statistics
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
try:
import boto3
except ImportError:
boto3 = None
class MetricAnalyzer:
    """Fetch metric series from Prometheus or CloudWatch and run simple analyses.

    ``detect_anomalies`` and ``analyze_trend`` are pure computations; the
    ``query_*`` methods perform network I/O against the configured backend.
    """

    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        self.source = source        # 'prometheus' or 'cloudwatch'
        self.endpoint = endpoint    # Prometheus base URL (required for Prometheus queries)
        self.region = region        # AWS region for CloudWatch
        if source == "cloudwatch" and boto3:
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        elif source == "cloudwatch" and not boto3:
            # boto3 is optional at import time; fail fast only when needed.
            print("⚠️ boto3 not installed. Install with: pip install boto3")
            sys.exit(1)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Run a PromQL range query over the last ``hours`` hours at 5m resolution.

        Returns the raw result series list, or [] on any failure.
        """
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)
            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }
            response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []
            return data['data']['result']
        except Exception as e:
            # Boundary: report and return empty rather than crash the CLI.
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Fetch CloudWatch datapoints (5-minute periods), sorted by timestamp.

        Returns [] on any failure.
        """
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)
            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]
            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )
            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])
        except Exception as e:
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Flag points more than ``sensitivity`` sample standard deviations from the mean.

        With fewer than 10 points, reports that detection was skipped.
        """
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }
        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)
        anomalies = [
            {
                "index": i,
                "value": value,
                # Deviation in sigmas; 0 when the series is constant (stdev == 0).
                "deviation": abs(value - mean) / stdev if stdev > 0 else 0,
            }
            for i, value in enumerate(values)
            if value > threshold_upper or value < threshold_lower
        ]
        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Classify the series as stable/increasing/decreasing via a least-squares slope."""
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}
        n = len(values)
        x_mean = (n - 1) / 2  # mean of 0..n-1
        y_mean = sum(values) / n
        numerator = sum((i - x_mean) * (y - y_mean) for i, y in enumerate(values))
        denominator = sum((i - x_mean) ** 2 for i in range(n))
        if denominator == 0:
            return {"trend": "flat", "slope": 0}
        slope = numerator / denominator
        # BUG FIX: compare against abs(y_mean). The old threshold
        # `0.01 * y_mean` was negative for negative-mean series, so such a
        # series could never be classified as 'stable'.
        if abs(slope) < 0.01 * abs(y_mean):  # under 1% change per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"
        return {
            "trend": trend,
            "slope": slope,
            "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0
        }
def print_results(results: Dict[str, Any]):
"""Pretty print analysis results."""
print("\n" + "="*60)
print("📊 METRIC ANALYSIS RESULTS")
print("="*60)
if "error" in results:
print(f"\n❌ Error: {results['error']}")
return
print(f"\n📈 Data Points: {results.get('data_points', 0)}")
# Trend analysis
if "trend" in results:
trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "")
print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}")
if "rate_of_change" in results["trend"]:
print(f" Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval")
# Anomaly detection
if "anomalies" in results:
anomaly_data = results["anomalies"]
if anomaly_data["anomalies_detected"]:
print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}")
print(f" Mean: {anomaly_data['stats']['mean']:.2f}")
print(f" Std Dev: {anomaly_data['stats']['stdev']:.2f}")
print(f" Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]")
print("\n Top Anomalies:")
for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]:
print(f" • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)")
else:
print("\n✅ No anomalies detected")
print("\n" + "="*60)
def main():
    """CLI entry point: query the chosen backend, then analyze and report."""
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Prometheus: Analyze request rate
python3 analyze_metrics.py prometheus \\
--endpoint http://localhost:9090 \\
--query 'rate(http_requests_total[5m])' \\
--hours 24
# CloudWatch: Analyze CPU utilization
python3 analyze_metrics.py cloudwatch \\
--namespace AWS/EC2 \\
--metric CPUUtilization \\
--dimensions InstanceId=i-1234567890abcdef0 \\
--hours 48
""",
    )
    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                        help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                        help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')
    args = parser.parse_args()

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)
        print(f"🔍 Querying Prometheus: {args.query}")
        series = analyzer.query_prometheus(args.query, args.hours)
        if not series:
            print("❌ No data returned")
            sys.exit(1)
        # Only the first returned series is analyzed.
        values = [float(sample[1]) for sample in series[0].get('values', [])]
    else:  # 'cloudwatch' — argparse choices guarantee one of the two
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)
        dims = dict(pair.split('=') for pair in args.dimensions.split(','))
        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        series = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours)
        if not series:
            print("❌ No data returned")
            sys.exit(1)
        values = [point['Average'] for point in series]

    print_results({
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity),
    })


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Generate Grafana dashboards from templates.
Supports: web applications, Kubernetes, databases, Redis, and custom metrics.
"""
import argparse
import sys
import json
from typing import Dict, List, Any, Optional
from pathlib import Path
class DashboardGenerator:
    """Assemble a Grafana dashboard JSON document panel-by-panel.

    Panels are appended top-to-bottom on Grafana's 24-column grid:
    ``row_y`` tracks the next free grid row and ``panel_id`` hands out
    unique panel ids. ``save`` writes the Grafana import payload
    (dashboard body plus ``overwrite``) to disk.
    """

    def __init__(self, title: str, datasource: str = "Prometheus"):
        self.title = title            # dashboard title shown in Grafana
        self.datasource = datasource  # datasource name referenced by every panel
        self.dashboard = self._create_base_dashboard()
        self.panel_id = 1             # next unique panel id to assign
        self.row_y = 0                # next free y coordinate on the grid

    def _create_base_dashboard(self) -> Dict[str, Any]:
        """Create base dashboard structure.

        Returns the import-API payload: the dashboard body plus
        ``overwrite: True`` so re-importing replaces an existing dashboard.
        """
        return {
            "dashboard": {
                "title": self.title,
                "tags": [],
                "timezone": "browser",
                "schemaVersion": 16,
                "version": 0,
                "refresh": "30s",
                "panels": [],
                "templating": {
                    "list": []
                },
                "time": {
                    "from": "now-6h",
                    "to": "now"
                }
            },
            "overwrite": True
        }

    def add_variable(self, name: str, label: str, query: str):
        """Add a query-backed template variable.

        ``refresh: 1`` re-runs the query on dashboard load; multi-select
        and include-all are disabled, so exactly one value is selected.
        """
        variable = {
            "name": name,
            "label": label,
            "type": "query",
            "datasource": self.datasource,
            "query": query,
            "refresh": 1,
            "regex": "",
            "multi": False,
            "includeAll": False
        }
        self.dashboard["dashboard"]["templating"]["list"].append(variable)

    def add_row(self, title: str):
        """Add a row panel (full-width, 1 grid unit tall) and advance the cursor."""
        panel = {
            "id": self.panel_id,
            "type": "row",
            "title": title,
            "collapsed": False,
            "gridPos": {"h": 1, "w": 24, "x": 0, "y": self.row_y}
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += 1

    def add_graph(self, title: str, targets: List[Dict[str, str]], unit: str = "short",
                  width: int = 12, height: int = 8):
        """Add a graph panel with one query target per entry in ``targets``.

        Each target dict needs a 'query' key and may carry a 'legend' key.
        NOTE(review): x is always 0, so a width < 24 leaves the right side of
        the grid empty; panels stack vertically via ``row_y``. refIds are
        assigned A, B, C, ... — more than 26 targets would produce
        non-letter ids; presumably never the case here.
        """
        panel = {
            "id": self.panel_id,
            "type": "graph",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": target["query"],
                    "legendFormat": target.get("legend", ""),
                    "refId": chr(65 + i)  # A, B, C, etc.
                }
                for i, target in enumerate(targets)
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "yaxes": [
                {"format": unit, "label": None, "show": True},
                {"format": "short", "label": None, "show": True}
            ],
            "lines": True,
            "fill": 1,
            "linewidth": 2,
            "legend": {
                "show": True,
                "alignAsTable": True,
                "avg": True,
                "current": True,
                "max": True,
                "min": False,
                "total": False,
                "values": True
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += height

    def add_stat(self, title: str, query: str, unit: str = "short",
                 width: int = 6, height: int = 4):
        """Add a stat panel (single value) showing the latest non-null sample.

        NOTE(review): unlike add_graph/add_row, this does NOT advance
        ``row_y``, so consecutive stat panels share the same gridPos —
        presumably relying on Grafana re-flowing overlapping panels on
        import; confirm against the target Grafana version.
        """
        panel = {
            "id": self.panel_id,
            "type": "stat",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": query,
                    "refId": "A"
                }
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "options": {
                "graphMode": "area",
                "orientation": "auto",
                "reduceOptions": {
                    "values": False,
                    "calcs": ["lastNotNull"]
                }
            },
            "fieldConfig": {
                "defaults": {
                    "unit": unit,
                    # Green below 80, red at/above 80 (absolute thresholds).
                    "thresholds": {
                        "mode": "absolute",
                        "steps": [
                            {"value": None, "color": "green"},
                            {"value": 80, "color": "red"}
                        ]
                    }
                }
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1

    def generate_webapp_dashboard(self, service: str):
        """Generate dashboard for web application.

        Builds request-rate, latency-quantile, error-rate and resource panels
        from the service's Prometheus metric names.
        NOTE(review): a $service template variable is added, but every query
        interpolates the Python ``service`` argument directly instead of
        referencing $service — confirm whether the variable is intended.
        """
        self.add_variable("service", "Service", f"label_values({service}_http_requests_total, service)")
        # Request metrics
        self.add_row("Request Metrics")
        self.add_graph(
            "Request Rate",
            [{"query": f'sum(rate({service}_http_requests_total[5m])) by (status)', "legend": "{{status}}"}],
            unit="reqps",
            width=12
        )
        self.add_graph(
            "Request Latency (p50, p95, p99)",
            [
                {"query": f'histogram_quantile(0.50, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p50"},
                {"query": f'histogram_quantile(0.95, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p95"},
                {"query": f'histogram_quantile(0.99, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p99"}
            ],
            unit="s",
            width=12
        )
        # Error rate (share of 5xx responses)
        self.add_row("Errors")
        self.add_graph(
            "Error Rate (%)",
            [{"query": f'sum(rate({service}_http_requests_total{{status=~"5.."}}[5m])) / sum(rate({service}_http_requests_total[5m])) * 100', "legend": "Error Rate"}],
            unit="percent",
            width=12
        )
        # Resource usage (process exporter metrics scoped by job)
        self.add_row("Resource Usage")
        self.add_graph(
            "CPU Usage",
            [{"query": f'sum(rate(process_cpu_seconds_total{{job="{service}"}}[5m])) * 100', "legend": "CPU %"}],
            unit="percent",
            width=12
        )
        self.add_graph(
            "Memory Usage",
            [{"query": f'process_resident_memory_bytes{{job="{service}"}}', "legend": "Memory"}],
            unit="bytes",
            width=12
        )

    def generate_kubernetes_dashboard(self, namespace: str):
        """Generate dashboard for Kubernetes cluster.

        Pod counts, per-pod CPU/memory, and network I/O from
        kube-state-metrics / cAdvisor metric names.
        NOTE(review): as with the webapp dashboard, the $namespace variable
        is added but queries hardcode the ``namespace`` argument.
        """
        self.add_variable("namespace", "Namespace", f"label_values(kube_pod_info, namespace)")
        # Cluster overview (four stat panels)
        self.add_row("Cluster Overview")
        self.add_stat("Total Pods", f'count(kube_pod_info{{namespace="{namespace}"}})', width=6)
        self.add_stat("Running Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Running"}})', width=6)
        self.add_stat("Pending Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Pending"}})', width=6)
        self.add_stat("Failed Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Failed"}})', width=6)
        # Resource usage
        self.add_row("Resource Usage")
        self.add_graph(
            "CPU Usage by Pod",
            [{"query": f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "{{pod}}"}],
            unit="percent",
            width=12
        )
        self.add_graph(
            "Memory Usage by Pod",
            [{"query": f'sum(container_memory_usage_bytes{{namespace="{namespace}"}}) by (pod)', "legend": "{{pod}}"}],
            unit="bytes",
            width=12
        )
        # Network throughput per pod
        self.add_row("Network")
        self.add_graph(
            "Network I/O",
            [
                {"query": f'sum(rate(container_network_receive_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Receive - {{pod}}"},
                {"query": f'sum(rate(container_network_transmit_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Transmit - {{pod}}"}
            ],
            unit="Bps",
            width=12
        )

    def generate_database_dashboard(self, db_type: str, instance: str):
        """Generate dashboard for database (postgres/mysql).

        Dispatches to the engine-specific builder; unknown db_type is a no-op.
        """
        if db_type == "postgres":
            self._generate_postgres_dashboard(instance)
        elif db_type == "mysql":
            self._generate_mysql_dashboard(instance)

    def _generate_postgres_dashboard(self, instance: str):
        """Generate PostgreSQL dashboard (postgres_exporter metric names)."""
        self.add_row("PostgreSQL Metrics")
        self.add_graph(
            "Connections",
            [
                {"query": f'pg_stat_database_numbackends{{instance="{instance}"}}', "legend": "{{datname}}"}
            ],
            unit="short",
            width=12
        )
        self.add_graph(
            "Transactions per Second",
            [
                {"query": f'rate(pg_stat_database_xact_commit{{instance="{instance}"}}[5m])', "legend": "Commits"},
                {"query": f'rate(pg_stat_database_xact_rollback{{instance="{instance}"}}[5m])', "legend": "Rollbacks"}
            ],
            unit="tps",
            width=12
        )
        self.add_graph(
            "Query Duration (p95)",
            [
                # NOTE(review): assumes a pg_stat_statements_total_time_bucket
                # histogram is exported — not a stock postgres_exporter metric;
                # verify against the exporter configuration.
                {"query": f'histogram_quantile(0.95, rate(pg_stat_statements_total_time_bucket{{instance="{instance}"}}[5m]))', "legend": "p95"}
            ],
            unit="ms",
            width=12
        )

    def _generate_mysql_dashboard(self, instance: str):
        """Generate MySQL dashboard (mysqld_exporter metric names)."""
        self.add_row("MySQL Metrics")
        self.add_graph(
            "Connections",
            [
                {"query": f'mysql_global_status_threads_connected{{instance="{instance}"}}', "legend": "Connected"},
                {"query": f'mysql_global_status_threads_running{{instance="{instance}"}}', "legend": "Running"}
            ],
            unit="short",
            width=12
        )
        self.add_graph(
            "Queries per Second",
            [
                {"query": f'rate(mysql_global_status_queries{{instance="{instance}"}}[5m])', "legend": "Queries"}
            ],
            unit="qps",
            width=12
        )

    def save(self, output_file: str) -> bool:
        """Write the dashboard payload as pretty-printed JSON.

        Returns True on success; prints the error and returns False otherwise.
        """
        try:
            with open(output_file, 'w') as f:
                json.dump(self.dashboard, f, indent=2)
            return True
        except Exception as e:
            # Boundary: report rather than propagate so the CLI can exit(1).
            print(f"❌ Error saving dashboard: {e}")
            return False
def main():
    """CLI entry point: build the requested dashboard template and write it to disk."""
    parser = argparse.ArgumentParser(
        description="Generate Grafana dashboards from templates",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Web application dashboard
python3 dashboard_generator.py webapp \\
--title "My API Dashboard" \\
--service my_api \\
--output dashboard.json
# Kubernetes dashboard
python3 dashboard_generator.py kubernetes \\
--title "K8s Namespace" \\
--namespace production \\
--output k8s-dashboard.json
# Database dashboard
python3 dashboard_generator.py database \\
--title "PostgreSQL" \\
--db-type postgres \\
--instance db.example.com:5432 \\
--output db-dashboard.json
""",
    )
    parser.add_argument('type', choices=['webapp', 'kubernetes', 'database'],
                        help='Dashboard type')
    parser.add_argument('--title', required=True, help='Dashboard title')
    parser.add_argument('--output', required=True, help='Output file path')
    parser.add_argument('--datasource', default='Prometheus', help='Data source name')
    # Web app specific
    parser.add_argument('--service', help='Service name (for webapp)')
    # Kubernetes specific
    parser.add_argument('--namespace', help='Kubernetes namespace')
    # Database specific
    parser.add_argument('--db-type', choices=['postgres', 'mysql'], help='Database type')
    parser.add_argument('--instance', help='Database instance')
    args = parser.parse_args()

    print(f"🎨 Generating {args.type} dashboard: {args.title}")
    generator = DashboardGenerator(args.title, args.datasource)

    # Each dashboard type validates its own required options before building.
    if args.type == 'webapp':
        if not args.service:
            print("❌ --service required for webapp dashboard")
            sys.exit(1)
        generator.generate_webapp_dashboard(args.service)
    elif args.type == 'kubernetes':
        if not args.namespace:
            print("❌ --namespace required for kubernetes dashboard")
            sys.exit(1)
        generator.generate_kubernetes_dashboard(args.namespace)
    elif args.type == 'database':
        if not args.db_type or not args.instance:
            print("❌ --db-type and --instance required for database dashboard")
            sys.exit(1)
        generator.generate_database_dashboard(args.db_type, args.instance)

    if not generator.save(args.output):
        sys.exit(1)
    print(f"✅ Dashboard saved to: {args.output}")
    print(f"\n📝 Import to Grafana:")
    print(f" 1. Go to Grafana → Dashboards → Import")
    print(f" 2. Upload {args.output}")
    print(f" 3. Select datasource and save")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
Analyze Datadog usage and identify cost optimization opportunities.
Helps find waste in custom metrics, logs, APM, and infrastructure monitoring.
"""
import argparse
import sys
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from collections import defaultdict
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
try:
from tabulate import tabulate
except ImportError:
tabulate = None
class DatadogCostAnalyzer:
# Pricing (as of 2024-2025)
PRICING = {
'infrastructure_pro': 15, # per host per month
'infrastructure_enterprise': 23,
'custom_metric': 0.01, # per metric per month (first 100 free per host)
'log_ingestion': 0.10, # per GB ingested per month
'apm_host': 31, # APM Pro per host per month
'apm_span': 1.70, # per million indexed spans
}
def __init__(self, api_key: str, app_key: str, site: str = "datadoghq.com"):
self.api_key = api_key
self.app_key = app_key
self.site = site
self.base_url = f"https://api.{site}"
self.headers = {
'DD-API-KEY': api_key,
'DD-APPLICATION-KEY': app_key,
'Content-Type': 'application/json'
}
def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
"""Make API request to Datadog."""
try:
url = f"{self.base_url}{endpoint}"
response = requests.get(url, headers=self.headers, params=params, timeout=30)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
print(f"❌ API Error: {e}")
return {}
def get_usage_metrics(self, start_date: str, end_date: str) -> Dict[str, Any]:
"""Get usage metrics for specified date range."""
endpoint = "/api/v1/usage/summary"
params = {
'start_month': start_date,
'end_month': end_date,
'include_org_details': 'true'
}
data = self._make_request(endpoint, params)
return data.get('usage', [])
def get_custom_metrics(self) -> Dict[str, Any]:
"""Get custom metrics usage and identify high-cardinality metrics."""
endpoint = "/api/v1/usage/timeseries"
# Get last 30 days
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'metrics': [], 'total_count': 0}
# Extract custom metrics info
usage_data = data.get('usage', [])
metrics_summary = {
'total_custom_metrics': 0,
'avg_custom_metrics': 0,
'billable_metrics': 0
}
for day in usage_data:
if 'timeseries' in day:
for ts in day['timeseries']:
if ts.get('metric_category') == 'custom':
metrics_summary['total_custom_metrics'] = max(
metrics_summary['total_custom_metrics'],
ts.get('num_custom_timeseries', 0)
)
# Calculate billable (first 100 free)
metrics_summary['billable_metrics'] = max(0, metrics_summary['total_custom_metrics'] - 100)
return metrics_summary
def get_infrastructure_hosts(self) -> Dict[str, Any]:
"""Get infrastructure host count and breakdown."""
endpoint = "/api/v1/usage/hosts"
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'total_hosts': 0}
usage = data.get('usage', [])
host_summary = {
'total_hosts': 0,
'agent_hosts': 0,
'aws_hosts': 0,
'azure_hosts': 0,
'gcp_hosts': 0,
'container_count': 0
}
for day in usage:
host_summary['total_hosts'] = max(host_summary['total_hosts'], day.get('host_count', 0))
host_summary['agent_hosts'] = max(host_summary['agent_hosts'], day.get('agent_host_count', 0))
host_summary['aws_hosts'] = max(host_summary['aws_hosts'], day.get('aws_host_count', 0))
host_summary['azure_hosts'] = max(host_summary['azure_hosts'], day.get('azure_host_count', 0))
host_summary['gcp_hosts'] = max(host_summary['gcp_hosts'], day.get('gcp_host_count', 0))
host_summary['container_count'] = max(host_summary['container_count'], day.get('container_count', 0))
return host_summary
def get_log_usage(self) -> Dict[str, Any]:
"""Get log ingestion and retention usage."""
endpoint = "/api/v1/usage/logs"
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'total_gb': 0, 'daily_avg_gb': 0}
usage = data.get('usage', [])
total_ingested = 0
days_count = len(usage)
for day in usage:
total_ingested += day.get('ingested_events_bytes', 0)
total_gb = total_ingested / (1024**3) # Convert to GB
daily_avg_gb = total_gb / max(days_count, 1)
return {
'total_gb': total_gb,
'daily_avg_gb': daily_avg_gb,
'monthly_projected_gb': daily_avg_gb * 30
}
def get_unused_monitors(self) -> List[Dict[str, Any]]:
    """Find monitors in OK / No Data state not modified in 30+ days.

    Returns:
        List of dicts with 'name', 'id', 'days_since_modified', 'state'.
        Monitors with a missing or unparseable 'modified' timestamp are
        skipped rather than aborting the scan.
    """
    # Local import: keeps the fix self-contained without touching the
    # module header (which is outside this block).
    from datetime import timezone

    data = self._make_request("/api/v1/monitor")
    if not data:
        return []
    monitors = data if isinstance(data, list) else []
    unused = []
    # BUG FIX: compare in UTC.  The API returns ISO-8601 'Z' timestamps;
    # the old code stripped tzinfo and compared against the *local* naive
    # clock, skewing the age by the machine's UTC offset.
    now = datetime.now(timezone.utc)
    for monitor in monitors:
        overall_state = monitor.get('overall_state')
        modified = monitor.get('modified', '')
        if not modified:
            continue
        try:
            mod_date = datetime.fromisoformat(modified.replace('Z', '+00:00'))
        except (ValueError, TypeError):
            # BUG FIX: was a bare `except: pass` (also swallowed
            # SystemExit/KeyboardInterrupt).  Skip only bad timestamps.
            continue
        if mod_date.tzinfo is None:
            mod_date = mod_date.replace(tzinfo=timezone.utc)
        days_since_modified = (now - mod_date).days
        # "Unused": quiet state and untouched for more than a month.
        if days_since_modified > 30 and overall_state in ['OK', 'No Data']:
            unused.append({
                'name': monitor.get('name', 'Unknown'),
                'id': monitor.get('id'),
                'days_since_modified': days_since_modified,
                'state': overall_state,
            })
    return unused
def calculate_costs(self, usage_data: Dict[str, Any]) -> Dict[str, float]:
    """Estimate monthly spend per product line from gathered usage data.

    Unit prices come from self.PRICING; infrastructure assumes the Pro
    tier.  Missing usage sections simply contribute zero.
    """
    costs = dict.fromkeys(('infrastructure', 'custom_metrics', 'logs', 'apm', 'total'), 0)

    if 'hosts' in usage_data:
        host_count = usage_data['hosts'].get('total_hosts', 0)
        costs['infrastructure'] = host_count * self.PRICING['infrastructure_pro']

    if 'custom_metrics' in usage_data:
        billable_metrics = usage_data['custom_metrics'].get('billable_metrics', 0)
        costs['custom_metrics'] = billable_metrics * self.PRICING['custom_metric']

    if 'logs' in usage_data:
        projected_gb = usage_data['logs'].get('monthly_projected_gb', 0)
        costs['logs'] = projected_gb * self.PRICING['log_ingestion']

    # 'total' is still zero at this point, so summing every value is safe.
    costs['total'] = sum(costs.values())
    return costs
def get_recommendations(self, usage_data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Generate cost optimization recommendations.

    Args:
        usage_data: Aggregated output of the get_* usage methods; any key
            may be absent if its API call failed.

    Returns:
        List of dicts with 'category', 'issue', 'action' and
        'potential_savings' keys.  (Annotation fixed: the original
        declared List[str] but has always returned dicts.)
    """
    recommendations = []

    # Custom metrics: flag high-cardinality growth.
    if 'custom_metrics' in usage_data:
        billable = usage_data['custom_metrics'].get('billable_metrics', 0)
        if billable > 500:
            savings = (billable * 0.3) * self.PRICING['custom_metric']  # Assume 30% reduction possible
            recommendations.append({
                'category': 'Custom Metrics',
                'issue': f'High custom metric count: {billable:,} billable metrics',
                'action': 'Review metric tags for high cardinality, consider aggregating or dropping unused metrics',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Container vs VM: dense container fleets are cheaper on container plans.
    if 'hosts' in usage_data:
        hosts = usage_data['hosts'].get('total_hosts', 0)
        containers = usage_data['hosts'].get('container_count', 0)
        if containers > hosts * 10:  # Many containers per host
            savings = hosts * 0.2 * self.PRICING['infrastructure_pro']
            recommendations.append({
                'category': 'Infrastructure',
                'issue': f'{containers:,} containers running on {hosts} hosts',
                'action': 'Consider using container monitoring instead of host-based (can be 50-70% cheaper)',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Unused monitors: operational hygiene rather than direct spend.
    if 'unused_monitors' in usage_data:
        count = len(usage_data['unused_monitors'])
        if count > 10:
            recommendations.append({
                'category': 'Monitors',
                'issue': f'{count} monitors unused for 30+ days',
                'action': 'Delete or disable unused monitors to reduce noise and improve performance',
                'potential_savings': 'Operational efficiency'
            })

    # Log volume: sampling/exclusion typically trims a large share.
    if 'logs' in usage_data:
        monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0)
        if monthly_gb > 100:
            savings = (monthly_gb * 0.4) * self.PRICING['log_ingestion']  # 40% reduction
            recommendations.append({
                'category': 'Logs',
                'issue': f'High log volume: {monthly_gb:.1f} GB/month projected',
                'action': 'Review log sources, implement sampling for debug logs, exclude health checks',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Strategic migration advice once spend is significant.
    costs = self.calculate_costs(usage_data)
    if costs['total'] > 5000:
        # BUG FIX: guard the 'hosts' lookup — it may be missing when the
        # hosts API call failed, which previously raised KeyError here.
        host_count = usage_data.get('hosts', {}).get('total_hosts', 0)
        oss_cost = host_count * 15  # Rough estimate for self-hosted
        savings = costs['total'] - oss_cost
        recommendations.append({
            'category': 'Strategic',
            'issue': f'Total monthly cost: ${costs["total"]:.2f}',
            'action': 'Consider migrating to open-source stack (Prometheus + Grafana + Loki)',
            'potential_savings': f'${savings:.2f}/month (~{(savings/costs["total"]*100):.0f}% reduction)'
        })
    return recommendations
def print_usage_summary(usage_data: Dict[str, Any]):
    """Print a human-readable summary of the collected Datadog usage data.

    Sections (infrastructure, custom metrics, logs, unused monitors) are
    printed only when their key is present in usage_data.
    """
    print("\n" + "=" * 70)
    print("📊 DATADOG USAGE SUMMARY")
    print("=" * 70)

    if 'hosts' in usage_data:
        hosts = usage_data['hosts']
        print(f"\n🖥️ Infrastructure:")
        host_rows = (
            ('Total Hosts', 'total_hosts'),
            ('Agent Hosts', 'agent_hosts'),
            ('AWS Hosts', 'aws_hosts'),
            ('Azure Hosts', 'azure_hosts'),
            ('GCP Hosts', 'gcp_hosts'),
            ('Containers', 'container_count'),
        )
        for label, key in host_rows:
            print(f" {label}: {hosts.get(key, 0):,}")

    if 'custom_metrics' in usage_data:
        metrics = usage_data['custom_metrics']
        print(f"\n📈 Custom Metrics:")
        print(f" Total: {metrics.get('total_custom_metrics', 0):,}")
        print(f" Billable: {metrics.get('billable_metrics', 0):,} (first 100 free)")

    if 'logs' in usage_data:
        logs = usage_data['logs']
        print(f"\n📝 Logs:")
        print(f" Daily Average: {logs.get('daily_avg_gb', 0):.2f} GB")
        print(f" Monthly Projected: {logs.get('monthly_projected_gb', 0):.2f} GB")

    if 'unused_monitors' in usage_data:
        print(f"\n🔔 Unused Monitors:")
        print(f" Count: {len(usage_data['unused_monitors'])}")
def print_cost_breakdown(costs: Dict[str, float]):
    """Print the estimated monthly cost breakdown with an annualized total.

    Expects the dict shape produced by calculate_costs().
    """
    print("\n" + "=" * 70)
    print("💰 ESTIMATED MONTHLY COSTS")
    print("=" * 70)
    body = [
        f" Infrastructure Monitoring: ${costs['infrastructure']:,.2f}",
        f" Custom Metrics: ${costs['custom_metrics']:,.2f}",
        f" Log Management: ${costs['logs']:,.2f}",
        f" APM: ${costs['apm']:,.2f}",
        " " + "-" * 40,
        f" TOTAL: ${costs['total']:,.2f}/month",
        f" ${costs['total'] * 12:,.2f}/year",
    ]
    # Leading newline matches the blank line the section always starts with.
    print("\n" + "\n".join(body))
def print_recommendations(recommendations: List[Dict]):
    """Print cost optimization recommendations plus total potential savings.

    Savings strings of the form '$X/month' are summed into the totals;
    non-dollar entries (e.g. 'Operational efficiency') are skipped.
    """
    print("\n" + "="*70)
    print("💡 COST OPTIMIZATION RECOMMENDATIONS")
    print("="*70)
    total_savings = 0
    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec['category']}")
        print(f" Issue: {rec['issue']}")
        print(f" Action: {rec['action']}")
        print(f" Potential Savings: {rec['potential_savings']}")
        # Extract savings amount if it's a dollar value
        if '$' in rec['potential_savings']:
            amount_text = rec['potential_savings'].replace('$', '').replace('/month', '').replace(',', '')
            try:
                total_savings += float(amount_text)
            except ValueError:
                # BUG FIX: was a bare `except:` (also swallowed SystemExit /
                # KeyboardInterrupt).  Savings strings carrying annotations,
                # e.g. '... (~40% reduction)', are not plain numbers and are
                # deliberately left out of the total.
                pass
    if total_savings > 0:
        print(f"\n{'='*70}")
        print(f"💵 Total Potential Monthly Savings: ${total_savings:,.2f}")
        print(f"💵 Total Potential Annual Savings: ${total_savings*12:,.2f}")
        print(f"{'='*70}")
def main() -> None:
    """CLI entry point: gather Datadog usage, estimate costs, print advice.

    Requires API + Application keys (flags or DD_API_KEY / DD_APP_KEY env
    vars); exits with status 1 when either is missing.
    """
    parser = argparse.ArgumentParser(
        description="Analyze Datadog usage and identify cost optimization opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Analyze current usage
python3 datadog_cost_analyzer.py \\
--api-key DD_API_KEY \\
--app-key DD_APP_KEY
# Use environment variables
export DD_API_KEY=your_api_key
export DD_APP_KEY=your_app_key
python3 datadog_cost_analyzer.py
# Specify site (for EU)
python3 datadog_cost_analyzer.py --site datadoghq.eu
Required Datadog Permissions:
- usage_read
- monitors_read
"""
    )
    parser.add_argument('--api-key',
                        default=os.environ.get('DD_API_KEY'),
                        help='Datadog API key (or set DD_API_KEY env var)')
    parser.add_argument('--app-key',
                        default=os.environ.get('DD_APP_KEY'),
                        help='Datadog Application key (or set DD_APP_KEY env var)')
    parser.add_argument('--site',
                        default='datadoghq.com',
                        help='Datadog site (default: datadoghq.com, EU: datadoghq.eu)')
    args = parser.parse_args()
    # Both credentials are mandatory: fail fast with a clear message.
    if not args.api_key or not args.app_key:
        print("❌ Error: API key and Application key required")
        print(" Set via --api-key and --app-key flags or DD_API_KEY and DD_APP_KEY env vars")
        sys.exit(1)
    print("🔍 Analyzing Datadog usage...")
    print(" This may take 30-60 seconds...\n")
    analyzer = DatadogCostAnalyzer(args.api_key, args.app_key, args.site)
    # Gather usage data (each call hits a separate usage API endpoint).
    usage_data = {}
    print(" ⏳ Fetching infrastructure usage...")
    usage_data['hosts'] = analyzer.get_infrastructure_hosts()
    print(" ⏳ Fetching custom metrics...")
    usage_data['custom_metrics'] = analyzer.get_custom_metrics()
    print(" ⏳ Fetching log usage...")
    usage_data['logs'] = analyzer.get_log_usage()
    print(" ⏳ Finding unused monitors...")
    usage_data['unused_monitors'] = analyzer.get_unused_monitors()
    # Calculate costs
    costs = analyzer.calculate_costs(usage_data)
    # Generate recommendations
    recommendations = analyzer.get_recommendations(usage_data)
    # Print results
    print_usage_summary(usage_data)
    print_cost_breakdown(costs)
    print_recommendations(recommendations)
    print("\n" + "="*70)
    print("✅ Analysis complete!")
    print("="*70)

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Validate health check endpoints and analyze response quality.
Checks: response time, status code, response format, dependencies.
"""
import argparse
import sys
import time
import json
from typing import Dict, List, Any, Optional
from urllib.parse import urlparse
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
class HealthCheckValidator:
    """Validates HTTP health-check endpoints against common best practices.

    Each validation records passed checks, warnings, and errors, then rolls
    them up into an overall HEALTHY / DEGRADED / UNHEALTHY status.
    """

    def __init__(self, timeout: int = 5):
        """
        Args:
            timeout: Per-request timeout in seconds for requests.get().
        """
        self.timeout = timeout
        self.results = []  # not read within this class; kept for API compatibility

    def validate_endpoint(self, url: str) -> Dict[str, Any]:
        """Validate a health check endpoint.

        Performs an HTTP GET and grades the response on status code,
        latency, content type, JSON structure, and caching headers.

        Returns:
            Dict with 'url', 'timestamp', 'checks'/'warnings'/'errors'
            lists, 'status_code'/'response_time' (None on request failure),
            and 'overall_status' of HEALTHY, DEGRADED, or UNHEALTHY.
        """
        result = {
            "url": url,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "checks": [],
            "warnings": [],
            "errors": []
        }
        try:
            # Time the request; verify=True enforces TLS certificate checks.
            start_time = time.time()
            response = requests.get(url, timeout=self.timeout, verify=True)
            response_time = time.time() - start_time
            result["status_code"] = response.status_code
            result["response_time"] = response_time
            # Check 1: Status code
            if response.status_code == 200:
                result["checks"].append("✅ Status code is 200")
            else:
                result["errors"].append(f"❌ Unexpected status code: {response.status_code} (expected 200)")
            # Check 2: Response time — <1s passes, <3s warns, else errors.
            if response_time < 1.0:
                result["checks"].append(f"✅ Response time: {response_time:.3f}s (< 1s)")
            elif response_time < 3.0:
                result["warnings"].append(f"⚠️ Slow response time: {response_time:.3f}s (should be < 1s)")
            else:
                result["errors"].append(f"❌ Very slow response time: {response_time:.3f}s (should be < 1s)")
            # Check 3: Content type — JSON preferred, plain text tolerated.
            content_type = response.headers.get('Content-Type', '')
            if 'application/json' in content_type:
                result["checks"].append("✅ Content-Type is application/json")
                # Try to parse JSON
                try:
                    data = response.json()
                    result["response_data"] = data
                    # Check for common health check fields
                    self._validate_json_structure(data, result)
                except json.JSONDecodeError:
                    result["errors"].append("❌ Invalid JSON response")
            elif 'text/plain' in content_type:
                result["warnings"].append("⚠️ Content-Type is text/plain (JSON recommended)")
                result["response_data"] = response.text
            else:
                result["warnings"].append(f"⚠️ Unexpected Content-Type: {content_type}")
            # Check 4: Response headers
            self._validate_headers(response.headers, result)
        except requests.exceptions.Timeout:
            result["errors"].append(f"❌ Request timeout (> {self.timeout}s)")
            result["status_code"] = None
            result["response_time"] = None
        # BUG FIX: requests.exceptions.SSLError subclasses ConnectionError,
        # so it must be caught FIRST — the original order made the SSL
        # branch unreachable and reported cert failures as plain
        # connection errors.
        except requests.exceptions.SSLError:
            result["errors"].append("❌ SSL certificate validation failed")
            result["status_code"] = None
            result["response_time"] = None
        except requests.exceptions.ConnectionError:
            result["errors"].append("❌ Connection error (endpoint unreachable)")
            result["status_code"] = None
            result["response_time"] = None
        except Exception as e:
            # Catch-all so one bad endpoint never aborts a multi-URL run.
            result["errors"].append(f"❌ Unexpected error: {str(e)}")
            result["status_code"] = None
            result["response_time"] = None
        # Roll-up: any error → UNHEALTHY, else any warning → DEGRADED.
        if result["errors"]:
            result["overall_status"] = "UNHEALTHY"
        elif result["warnings"]:
            result["overall_status"] = "DEGRADED"
        else:
            result["overall_status"] = "HEALTHY"
        return result

    def _validate_json_structure(self, data: Dict[str, Any], result: Dict[str, Any]):
        """Validate JSON health check structure.

        Appends findings about the 'status' field, version/build info,
        dependency checks, and metrics into *result* in place.
        """
        # Check for status field
        if "status" in data:
            status = data["status"]
            if status in ["ok", "healthy", "up", "pass"]:
                result["checks"].append(f"✅ Status field present: '{status}'")
            else:
                result["warnings"].append(f"⚠️ Status field has unexpected value: '{status}'")
        else:
            result["warnings"].append("⚠️ Missing 'status' field (recommended)")
        # Check for version/build info
        if any(key in data for key in ["version", "build", "commit", "timestamp"]):
            result["checks"].append("✅ Version/build information present")
        else:
            result["warnings"].append("⚠️ No version/build information (recommended)")
        # Check for dependencies
        if "dependencies" in data or "checks" in data or "components" in data:
            result["checks"].append("✅ Dependency checks present")
            # Validate dependency structure: entries may be dicts with a
            # 'status' key or bare status strings.
            deps = data.get("dependencies") or data.get("checks") or data.get("components")
            if isinstance(deps, dict):
                unhealthy_deps = []
                for name, info in deps.items():
                    if isinstance(info, dict):
                        dep_status = info.get("status", "unknown")
                        if dep_status not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)
                    elif isinstance(info, str):
                        if info not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)
                if unhealthy_deps:
                    result["warnings"].append(f"⚠️ Unhealthy dependencies: {', '.join(unhealthy_deps)}")
                else:
                    result["checks"].append(f"✅ All dependencies healthy ({len(deps)} checked)")
        else:
            result["warnings"].append("⚠️ No dependency checks (recommended for production services)")
        # Check for uptime/metrics
        if any(key in data for key in ["uptime", "metrics", "stats"]):
            result["checks"].append("✅ Metrics/stats present")

    def _validate_headers(self, headers: Dict[str, str], result: Dict[str, Any]):
        """Validate response headers (caching must be explicitly disabled)."""
        cache_control = headers.get('Cache-Control', '')
        if 'no-cache' in cache_control or 'no-store' in cache_control:
            result["checks"].append("✅ Caching disabled (Cache-Control: no-cache)")
        else:
            result["warnings"].append("⚠️ Caching not explicitly disabled (add Cache-Control: no-cache)")

    def validate_multiple(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Validate multiple health check endpoints, in order."""
        results = []
        for url in urls:
            print(f"🔍 Checking: {url}")
            result = self.validate_endpoint(url)
            results.append(result)
        return results
def print_result(result: Dict[str, Any], verbose: bool = False):
    """Pretty-print one endpoint validation result.

    Args:
        result: Dict produced by HealthCheckValidator.validate_endpoint().
        verbose: When True, also dump the captured response body.
    """
    # NOTE(review): the HEALTHY/UNHEALTHY emoji values appear blank in this
    # source — confirm the intended glyphs survived the file's encoding.
    status_emoji = {
        "HEALTHY": "",
        "DEGRADED": "⚠️",
        "UNHEALTHY": ""
    }
    print("\n" + "="*60)
    emoji = status_emoji.get(result["overall_status"], "")
    print(f"{emoji} {result['overall_status']}: {result['url']}")
    print("="*60)
    # status_code is None when the request itself failed — skip metrics then.
    if result.get("status_code"):
        print(f"\n📊 Status Code: {result['status_code']}")
        print(f"⏱️ Response Time: {result['response_time']:.3f}s")
    # Print checks
    if result["checks"]:
        print(f"\n✅ Passed Checks:")
        for check in result["checks"]:
            print(f" {check}")
    # Print warnings
    if result["warnings"]:
        print(f"\n⚠️ Warnings:")
        for warning in result["warnings"]:
            print(f" {warning}")
    # Print errors
    if result["errors"]:
        print(f"\n❌ Errors:")
        for error in result["errors"]:
            print(f" {error}")
    # Print response data if verbose
    if verbose and "response_data" in result:
        print(f"\n📄 Response Data:")
        if isinstance(result["response_data"], dict):
            print(json.dumps(result["response_data"], indent=2))
        else:
            print(result["response_data"])
    print("="*60)
def print_summary(results: List[Dict[str, Any]]):
    """Print an aggregate summary for multiple endpoint validations.

    Args:
        results: List of dicts from HealthCheckValidator.validate_endpoint().
    """
    print("\n" + "="*60)
    print("📊 HEALTH CHECK VALIDATION SUMMARY")
    print("="*60)
    healthy = sum(1 for r in results if r["overall_status"] == "HEALTHY")
    degraded = sum(1 for r in results if r["overall_status"] == "DEGRADED")
    unhealthy = sum(1 for r in results if r["overall_status"] == "UNHEALTHY")
    print(f"\n✅ Healthy: {healthy}/{len(results)}")
    print(f"⚠️ Degraded: {degraded}/{len(results)}")
    print(f"❌ Unhealthy: {unhealthy}/{len(results)}")
    if results:
        # BUG FIX: average only over results that actually recorded a
        # response time.  The old code summed the timed subset but divided
        # by len(results), so unreachable endpoints (response_time=None)
        # dragged the reported mean down.
        timed = [r["response_time"] for r in results if r.get("response_time")]
        avg_response_time = sum(timed) / len(timed) if timed else 0
        print(f"\n⏱️ Average Response Time: {avg_response_time:.3f}s")
    print("="*60)
def main() -> None:
    """CLI entry point: validate one or more health-check URLs.

    Prints each endpoint's result, plus a roll-up summary when more than
    one URL was supplied.
    """
    parser = argparse.ArgumentParser(
        description="Validate health check endpoints",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Check a single endpoint
python3 health_check_validator.py https://api.example.com/health
# Check multiple endpoints
python3 health_check_validator.py \\
https://api.example.com/health \\
https://api.example.com/readiness
# Verbose output with response data
python3 health_check_validator.py https://api.example.com/health --verbose
# Custom timeout
python3 health_check_validator.py https://api.example.com/health --timeout 10
Best Practices Checked:
✓ Returns 200 status code
✓ Response time < 1 second
✓ Returns JSON format
✓ Contains 'status' field
✓ Includes version/build info
✓ Checks dependencies
✓ Includes metrics
✓ Disables caching
"""
    )
    parser.add_argument('urls', nargs='+', help='Health check endpoint URL(s)')
    parser.add_argument('--timeout', type=int, default=5, help='Request timeout in seconds (default: 5)')
    parser.add_argument('--verbose', action='store_true', help='Show detailed response data')
    args = parser.parse_args()
    validator = HealthCheckValidator(timeout=args.timeout)
    results = validator.validate_multiple(args.urls)
    # Print individual results
    for result in results:
        print_result(result, args.verbose)
    # Print summary if multiple endpoints
    if len(results) > 1:
        print_summary(results)

321
scripts/log_analyzer.py Normal file
View File

@@ -0,0 +1,321 @@
#!/usr/bin/env python3
"""
Parse and analyze logs for patterns, errors, and anomalies.
Supports: error detection, frequency analysis, pattern matching.
"""
import argparse
import sys
import re
import json
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
try:
from tabulate import tabulate
except ImportError:
tabulate = None
class LogAnalyzer:
    """Parses a log file and extracts log levels, error patterns,
    timestamps, error lines, and stack traces via regex heuristics."""

    # Common log level patterns.  analyze_log_levels() applies them in
    # this dict's order and stops at the first match per line.
    LOG_LEVELS = {
        'ERROR': r'\b(ERROR|Error|error)\b',
        'WARN': r'\b(WARN|Warning|warn|warning)\b',
        'INFO': r'\b(INFO|Info|info)\b',
        'DEBUG': r'\b(DEBUG|Debug|debug)\b',
        'FATAL': r'\b(FATAL|Fatal|fatal|CRITICAL|Critical)\b'
    }
    # Common error patterns (matched case-insensitively, independently).
    ERROR_PATTERNS = {
        'exception': r'Exception|exception|EXCEPTION',
        'stack_trace': r'\s+at\s+.*\(.*:\d+\)',
        'http_error': r'\b[45]\d{2}\b',  # 4xx and 5xx HTTP codes
        'timeout': r'timeout|timed out|TIMEOUT',
        'connection_refused': r'connection refused|ECONNREFUSED',
        'out_of_memory': r'OutOfMemoryError|OOM|out of memory',
        'null_pointer': r'NullPointerException|null pointer|NPE',
        'database_error': r'SQLException|database error|DB error'
    }

    def __init__(self, log_file: str):
        # Path to the log file; contents loaded by parse_file().
        self.log_file = log_file
        self.lines: List[str] = []           # raw lines (with newlines)
        self.log_levels: Counter = Counter()        # level name -> line count
        self.error_patterns: Counter = Counter()    # pattern name -> match count
        self.timestamps: List[str] = []             # raw timestamp strings, file order

    def parse_file(self) -> bool:
        """Read the whole log file into memory. Returns True on success."""
        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            with open(self.log_file, 'r', encoding='utf-8', errors='ignore') as f:
                self.lines = f.readlines()
            return True
        except Exception as e:
            print(f"❌ Error reading file: {e}")
            return False

    def analyze_log_levels(self):
        """Count lines per log level into self.log_levels.

        Each line is attributed to at most one level: the first pattern in
        LOG_LEVELS that matches.
        """
        for line in self.lines:
            for level, pattern in self.LOG_LEVELS.items():
                if re.search(pattern, line):
                    self.log_levels[level] += 1
                    break  # Count each line only once

    def analyze_error_patterns(self):
        """Count occurrences of each ERROR_PATTERNS entry into
        self.error_patterns.  A single line may match several patterns."""
        for line in self.lines:
            for pattern_name, pattern in self.ERROR_PATTERNS.items():
                if re.search(pattern, line, re.IGNORECASE):
                    self.error_patterns[pattern_name] += 1

    def extract_timestamps(self, timestamp_pattern: Optional[str] = None):
        """Collect the first timestamp match per line into self.timestamps.

        Args:
            timestamp_pattern: Optional custom regex; otherwise ISO,
                Apache, and syslog formats are tried in that order.
        """
        if not timestamp_pattern:
            # Common timestamp patterns
            patterns = [
                r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}',  # ISO format
                r'\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}',  # Apache format
                r'\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}',  # Syslog format
            ]
        else:
            patterns = [timestamp_pattern]
        for line in self.lines:
            for pattern in patterns:
                match = re.search(pattern, line)
                if match:
                    self.timestamps.append(match.group())
                    break

    def find_error_lines(self, context: int = 2) -> List[Dict[str, Any]]:
        """Return ERROR/FATAL lines, each with `context` surrounding lines.

        Returns:
            Dicts with 1-indexed 'line_number', stripped 'line', and the
            raw 'context' text (original newlines preserved).
        """
        errors = []
        for i, line in enumerate(self.lines):
            # Check if line contains error keywords
            is_error = any(re.search(pattern, line, re.IGNORECASE)
                           for pattern in [self.LOG_LEVELS['ERROR'], self.LOG_LEVELS['FATAL']])
            if is_error:
                # Get context lines (clamped to the file's bounds)
                start = max(0, i - context)
                end = min(len(self.lines), i + context + 1)
                context_lines = self.lines[start:end]
                errors.append({
                    'line_number': i + 1,  # 1-indexed for display
                    'line': line.strip(),
                    'context': ''.join(context_lines)
                })
        return errors

    def analyze_frequency(self, time_window_minutes: int = 5) -> Dict[str, Any]:
        """Rough log-rate estimate per time window.

        NOTE(review): as the inline comment says, this treats each
        extracted timestamp as one unit of time rather than parsing real
        times — a deliberate simplification; verify before relying on it.
        """
        if not self.timestamps:
            return {"error": "No timestamps found"}
        # This is a simplified version - in production you'd parse actual timestamps
        total_lines = len(self.lines)
        if self.timestamps:
            time_span = len(self.timestamps)
            avg_per_window = total_lines / max(1, time_span / time_window_minutes)
        else:
            avg_per_window = 0
        return {
            "total_lines": total_lines,
            "timestamps_found": len(self.timestamps),
            "avg_per_window": avg_per_window
        }

    def extract_unique_messages(self, pattern: str) -> List[str]:
        """Return unique matches of *pattern*, in first-seen order.

        If the pattern has capture groups, group 1 is used; otherwise the
        whole match.  Matching is case-insensitive.
        """
        matches = []
        seen = set()
        for line in self.lines:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                msg = match.group() if match.lastindex is None else match.group(1)
                if msg not in seen:
                    matches.append(msg)
                    seen.add(msg)
        return matches

    def find_stack_traces(self) -> List[Dict[str, Any]]:
        """Extract stack traces (exception header + '  at ...' frames,
        e.g. Java-style).

        Returns:
            Dicts with 1-indexed 'line_start' and the newline-joined,
            stripped 'trace' text.
        """
        stack_traces = []
        current_trace = []
        in_trace = False
        for i, line in enumerate(self.lines):
            # Start of stack trace
            if re.search(r'Exception|Error.*:', line):
                if current_trace:
                    # A new header closes the previous trace first.
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = [line.strip()]
                in_trace = True
            # Stack trace continuation
            elif in_trace and re.search(r'^\s+at\s+', line):
                current_trace.append(line.strip())
            # End of stack trace
            elif in_trace:
                if current_trace:
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = []
                in_trace = False
        # Add last trace if exists (file ended mid-trace)
        if current_trace:
            stack_traces.append({
                'line_start': len(self.lines) - len(current_trace) + 1,
                'trace': '\n'.join(current_trace)
            })
        return stack_traces
def print_analysis_results(analyzer: LogAnalyzer, show_errors: bool = False,
                           show_traces: bool = False):
    """Print the full analysis report for a parsed LogAnalyzer.

    Args:
        analyzer: LogAnalyzer whose analyze_* methods have already run.
        show_errors: Also list the first 10 ERROR/FATAL lines.
        show_traces: Also dump up to 5 extracted stack traces.
    """
    print("\n" + "="*60)
    print("📝 LOG ANALYSIS RESULTS")
    print("="*60)
    print(f"\n📁 File: {analyzer.log_file}")
    print(f"📊 Total Lines: {len(analyzer.lines):,}")
    # Log levels
    if analyzer.log_levels:
        print(f"\n{'='*60}")
        print("📊 LOG LEVEL DISTRIBUTION:")
        print(f"{'='*60}")
        # NOTE(review): the ERROR/INFO emoji values appear blank in this
        # source — confirm the intended glyphs survived the encoding.
        level_emoji = {
            'FATAL': '🔴',
            'ERROR': '',
            'WARN': '⚠️',
            'INFO': '',
            'DEBUG': '🐛'
        }
        for level, count in analyzer.log_levels.most_common():
            emoji = level_emoji.get(level, '')
            # Share of ALL lines, not just leveled ones.
            percentage = (count / len(analyzer.lines)) * 100
            print(f"{emoji} {level:10s}: {count:6,} ({percentage:5.1f}%)")
    # Error patterns (top 10 by count)
    if analyzer.error_patterns:
        print(f"\n{'='*60}")
        print("🔍 ERROR PATTERNS DETECTED:")
        print(f"{'='*60}")
        for pattern, count in analyzer.error_patterns.most_common(10):
            print(f"{pattern:20s}: {count:,} occurrences")
    # Timestamps
    if analyzer.timestamps:
        print(f"\n{'='*60}")
        print(f"⏰ Timestamps Found: {len(analyzer.timestamps):,}")
        print(f" First: {analyzer.timestamps[0]}")
        print(f" Last: {analyzer.timestamps[-1]}")
    # Error lines (first 10, with one line of context each)
    if show_errors:
        errors = analyzer.find_error_lines(context=1)
        if errors:
            print(f"\n{'='*60}")
            print(f"❌ ERROR LINES (showing first 10 of {len(errors)}):")
            print(f"{'='*60}")
            for error in errors[:10]:
                print(f"\nLine {error['line_number']}:")
                print(f" {error['line']}")
    # Stack traces (first 5)
    if show_traces:
        traces = analyzer.find_stack_traces()
        if traces:
            print(f"\n{'='*60}")
            print(f"📚 STACK TRACES FOUND: {len(traces)}")
            print(f"{'='*60}")
            for i, trace in enumerate(traces[:5], 1):
                print(f"\nTrace {i} (starting at line {trace['line_start']}):")
                print(trace['trace'])
                # Separator between traces (not after the last one shown)
                if i < len(traces):
                    print("\n" + "-"*60)
    print("\n" + "="*60)
def main() -> None:
    """CLI entry point: parse a log file and print the analysis report.

    Exits with status 1 when the file is missing or unreadable.
    """
    parser = argparse.ArgumentParser(
        description="Analyze log files for errors, patterns, and anomalies",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic analysis
python3 log_analyzer.py application.log
# Show error lines with context
python3 log_analyzer.py application.log --show-errors
# Show stack traces
python3 log_analyzer.py application.log --show-traces
# Full analysis
python3 log_analyzer.py application.log --show-errors --show-traces
Features:
• Log level distribution (ERROR, WARN, INFO, DEBUG, FATAL)
• Common error pattern detection
• Timestamp extraction
• Error line identification with context
• Stack trace extraction
• Frequency analysis
"""
    )
    parser.add_argument('log_file', help='Path to log file')
    parser.add_argument('--show-errors', action='store_true', help='Show error lines')
    parser.add_argument('--show-traces', action='store_true', help='Show stack traces')
    parser.add_argument('--timestamp-pattern', help='Custom regex for timestamp extraction')
    args = parser.parse_args()
    # Fail fast on a missing input file.
    if not Path(args.log_file).exists():
        print(f"❌ File not found: {args.log_file}")
        sys.exit(1)
    print(f"🔍 Analyzing log file: {args.log_file}")
    analyzer = LogAnalyzer(args.log_file)
    if not analyzer.parse_file():
        sys.exit(1)
    # Perform analysis
    analyzer.analyze_log_levels()
    analyzer.analyze_error_patterns()
    analyzer.extract_timestamps(args.timestamp_pattern)
    # Print results
    print_analysis_results(analyzer, args.show_errors, args.show_traces)

365
scripts/slo_calculator.py Normal file
View File

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
Calculate SLO compliance, error budgets, and burn rates.
Supports availability SLOs and latency SLOs.
"""
import argparse
import sys
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
try:
from tabulate import tabulate
except ImportError:
print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate")
tabulate = None
class SLOCalculator:
    """Computes SLO compliance, error budgets, and burn rates for a
    configurable target percentage and evaluation period."""

    # Allowed downtime (in days) per period for common SLO targets.
    SLO_TARGETS = {
        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},  # days
        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
    }

    def __init__(self, slo_target: float, period_days: int = 30):
        """
        Initialize SLO calculator.

        Args:
            slo_target: SLO target percentage (e.g., 99.9)
            period_days: Time period in days (default: 30)
        """
        self.slo_target = slo_target
        self.period_days = period_days
        # Pre-computed minutes of allowed downtime for the whole period.
        self.error_budget_minutes = self.calculate_error_budget_minutes()

    def calculate_error_budget_minutes(self) -> float:
        """Return the error budget for the period, in minutes."""
        minutes_in_period = self.period_days * 24 * 60
        return minutes_in_period * ((100 - self.slo_target) / 100)

    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
        """
        Calculate availability SLO compliance.

        Args:
            total_requests: Total number of requests
            failed_requests: Number of failed requests

        Returns:
            Dict with SLO compliance metrics, or an 'error' entry (with
            slo_met=False) when there were no requests at all.
        """
        if total_requests == 0:
            return {"error": "No requests in the period", "slo_met": False}

        successes = total_requests - failed_requests
        success_rate = (successes / total_requests) * 100
        error_rate = (failed_requests / total_requests) * 100

        # Budget: how many failures the SLO permits over this many requests.
        allowed_failures = total_requests * ((100 - self.slo_target) / 100)
        if allowed_failures > 0:
            budget_consumed = (failed_requests / allowed_failures) * 100
        else:
            budget_consumed = float('inf')

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "failed_requests": failed_requests,
            "success_requests": successes,
            "success_rate": success_rate,
            "error_rate": error_rate,
            "slo_met": success_rate >= self.slo_target,
            "error_budget_total": allowed_failures,
            "error_budget_consumed": budget_consumed,
            "error_budget_remaining": max(0, 100 - budget_consumed),
            "margin": success_rate - self.slo_target,
        }

    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
        """
        Calculate latency SLO compliance.

        Args:
            total_requests: Total number of requests
            requests_exceeding_threshold: Number of requests exceeding latency threshold

        Returns:
            Dict with SLO compliance metrics, or an 'error' entry (with
            slo_met=False) when there were no requests at all.
        """
        if total_requests == 0:
            return {"error": "No requests in the period", "slo_met": False}

        fast_requests = total_requests - requests_exceeding_threshold
        within_threshold_rate = (fast_requests / total_requests) * 100

        # Budget: how many slow requests the SLO permits.
        allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100)
        if allowed_slow_requests > 0:
            budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100
        else:
            budget_consumed = float('inf')

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "requests_exceeding_threshold": requests_exceeding_threshold,
            "requests_within_threshold": fast_requests,
            "within_threshold_rate": within_threshold_rate,
            "slo_met": within_threshold_rate >= self.slo_target,
            "error_budget_total": allowed_slow_requests,
            "error_budget_consumed": budget_consumed,
            "error_budget_remaining": max(0, 100 - budget_consumed),
            "margin": within_threshold_rate - self.slo_target,
        }

    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
        """
        Calculate error budget burn rate.

        Args:
            errors_in_window: Number of errors in the time window
            requests_in_window: Total requests in the time window
            window_hours: Size of the time window in hours

        Returns:
            Dict with burn-rate metrics, or {'error': ...} when the window
            had no requests.  burn_rate == 1 means the budget lasts exactly
            the configured period.
        """
        if requests_in_window == 0:
            return {"error": "No requests in window"}

        actual_error_rate = (errors_in_window / requests_in_window) * 100
        allowed_error_rate = 100 - self.slo_target
        if allowed_error_rate > 0:
            burn_rate = actual_error_rate / allowed_error_rate
        else:
            burn_rate = float('inf')

        # Time until the whole period's budget is gone at this rate.
        if burn_rate > 0:
            hours_to_exhaustion = (self.error_budget_minutes / 60) / burn_rate
        else:
            hours_to_exhaustion = float('inf')

        # Severity thresholds (14.4x / 6x match common multi-window
        # burn-rate alerting guidance, e.g. the Google SRE workbook).
        if burn_rate >= 14.4:  # 1 hour window, burns budget in 2 days
            severity = "critical"
        elif burn_rate >= 6:  # 6 hour window, burns budget in 5 days
            severity = "warning"
        elif burn_rate >= 1:
            severity = "elevated"
        else:
            severity = "normal"

        return {
            "window_hours": window_hours,
            "requests_in_window": requests_in_window,
            "errors_in_window": errors_in_window,
            "actual_error_rate": actual_error_rate,
            "allowed_error_rate": allowed_error_rate,
            "burn_rate": burn_rate,
            "hours_to_exhaustion": hours_to_exhaustion,
            "severity": severity,
        }

    @staticmethod
    def print_slo_table():
        """Print a table of common SLO targets and their allowed downtime."""
        if not tabulate:
            print("Install tabulate for formatted output: pip install tabulate")
            return
        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
        print("="*60)
        headers = ["SLO", "Year", "Month", "Week", "Day"]
        rows = [
            [f"{slo}%"] + [f"{downtimes[period]:.2f} days"
                           for period in ("year", "month", "week", "day")]
            for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True)
        ]
        print(tabulate(rows, headers=headers, tablefmt="grid"))
def print_availability_results(results: Dict[str, Any]):
    """Write a human-readable availability-SLO compliance report to stdout.

    Args:
        results: Metrics dict from the availability (or latency) SLO
            calculation; may instead be {"error": msg}, in which case only
            the error line is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("📊 AVAILABILITY SLO COMPLIANCE")
    print(banner)
    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return
    slo_met = results['slo_met']
    # NOTE(review): status markers are empty strings in the original source
    # (possibly stripped emoji); reproduced verbatim.
    status_emoji = "" if slo_met else ""
    status_text = "MET" if slo_met else "VIOLATED"
    print(f"\n{status_emoji} SLO Status: {status_text}")
    print(f" Target: {results['slo_target']}%")
    print(f" Actual: {results['success_rate']:.3f}%")
    print(f" Margin: {results['margin']:+.3f}%")
    print("\n📈 Request Statistics:")
    print(f" Total Requests: {results['total_requests']:,}")
    print(f" Successful: {results['success_requests']:,}")
    print(f" Failed: {results['failed_requests']:,}")
    print(f" Error Rate: {results['error_rate']:.3f}%")
    print("\n💰 Error Budget:")
    remaining = results['error_budget_remaining']
    # Marker tiers: healthy (>20%), warning (>0%), exhausted.
    if remaining > 20:
        budget_emoji = ""
    elif remaining > 0:
        budget_emoji = "⚠️"
    else:
        budget_emoji = ""
    print(f" {budget_emoji} Remaining: {remaining:.1f}%")
    print(f" Consumed: {results['error_budget_consumed']:.1f}%")
    print(f" Allowed Failures: {results['error_budget_total']:.0f}")
    print("\n" + banner)
def print_burn_rate_results(results: Dict[str, Any]):
    """Write a human-readable error-budget burn-rate report to stdout.

    Args:
        results: Metrics dict from the burn-rate calculation; may instead be
            {"error": msg}, in which case only the error line is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🔥 ERROR BUDGET BURN RATE")
    print(banner)
    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return
    # Traffic-light marker per severity tier; unknown tiers get no marker.
    emoji_by_severity = {
        "critical": "🔴",
        "warning": "🟡",
        "elevated": "🟠",
        "normal": "🟢"
    }
    severity = results['severity']
    marker = emoji_by_severity.get(severity, '')
    print(f"\n{marker} Severity: {severity.upper()}")
    print(f" Burn Rate: {results['burn_rate']:.2f}x")
    hours_left = results['hours_to_exhaustion']
    print(f" Time to Exhaustion: {hours_left:.1f} hours ({hours_left/24:.1f} days)")
    print("\n📊 Window Statistics:")
    print(f" Window: {results['window_hours']} hours")
    print(f" Requests: {results['requests_in_window']:,}")
    print(f" Errors: {results['errors_in_window']:,}")
    print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%")
    print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%")
    print("\n" + banner)
def main():
    """CLI entry point: parse arguments and dispatch to the requested calculation.

    Modes:
        availability — compliance against an availability SLO
        latency      — compliance against a latency SLO (same report format)
        burn-rate    — error-budget burn rate over a time window

    Exits with status 1 when required arguments for the chosen mode are missing.
    """
    parser = argparse.ArgumentParser(
        description="Calculate SLO compliance and error budgets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Show SLO reference table
python3 slo_calculator.py --table
# Calculate availability SLO
python3 slo_calculator.py availability \\
--slo 99.9 \\
--total-requests 1000000 \\
--failed-requests 1500 \\
--period-days 30
# Calculate latency SLO
python3 slo_calculator.py latency \\
--slo 99.5 \\
--total-requests 500000 \\
--slow-requests 3000 \\
--period-days 7
# Calculate burn rate
python3 slo_calculator.py burn-rate \\
--slo 99.9 \\
--errors 50 \\
--requests 10000 \\
--window-hours 1
"""
    )
    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
                        help='Calculation mode')
    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')
    # Availability SLO arguments
    parser.add_argument('--total-requests', type=int, help='Total number of requests')
    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')
    # Latency SLO arguments
    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')
    # Burn rate arguments
    parser.add_argument('--errors', type=int, help='Number of errors in window')
    parser.add_argument('--requests', type=int, help='Number of requests in window')
    parser.add_argument('--window-hours', type=float, help='Window size in hours')
    args = parser.parse_args()
    # Reference-table mode needs no further arguments.
    if args.table:
        SLOCalculator.print_slo_table()
        return
    if not args.mode:
        parser.print_help()
        return
    # BUG FIX: compare against None rather than truthiness, so explicit zero
    # values (e.g. --requests 0) are not misreported as missing arguments;
    # the calculator itself handles zero-request windows gracefully.
    if args.slo is None:
        print("❌ --slo required")
        sys.exit(1)
    calculator = SLOCalculator(args.slo, args.period_days)
    if args.mode == 'availability':
        if args.total_requests is None or args.failed_requests is None:
            print("❌ --total-requests and --failed-requests required")
            sys.exit(1)
        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
        print_availability_results(results)
    elif args.mode == 'latency':
        if args.total_requests is None or args.slow_requests is None:
            print("❌ --total-requests and --slow-requests required")
            sys.exit(1)
        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
        # Latency results share the availability report's keys and format.
        print_availability_results(results)
    elif args.mode == 'burn-rate':
        if args.errors is None or args.requests is None or args.window_hours is None:
            print("❌ --errors, --requests, and --window-hours required")
            sys.exit(1)
        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
        print_burn_rate_results(results)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()