Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Audit Prometheus alert rules against best practices.
Checks for: alert naming, severity labels, runbook links, expression quality.
"""
import argparse
import sys
import os
import re
from typing import Dict, List, Any
from pathlib import Path
try:
import yaml
except ImportError:
print("⚠️ Warning: 'PyYAML' library not found. Install with: pip install pyyaml")
sys.exit(1)
class AlertQualityChecker:
    """Audit Prometheus alerting rules for naming, labels, annotations and PromQL quality.

    Hard problems are returned as per-rule issue lists by the ``check_*``
    methods; softer advice accumulates on the instance in ``warnings`` and
    ``recommendations`` and is de-duplicated by the caller when printed.
    """

    # Prometheus duration unit -> seconds (the units valid in rule files).
    _DURATION_UNITS = {
        'ms': 0.001, 's': 1, 'm': 60, 'h': 3600,
        'd': 86400, 'w': 604800, 'y': 31536000,
    }
    # A critical alert's 'for' clause at/below this many seconds is flagged
    # as likely to flap.
    _MIN_CRITICAL_FOR_SECONDS = 120

    def __init__(self):
        self.issues = []            # kept for API compatibility (not populated here)
        self.warnings = []          # soft, file-wide problems
        self.recommendations = []   # optional improvements

    @classmethod
    def _duration_to_seconds(cls, duration: str):
        """Parse a Prometheus duration string (e.g. '30s', '5m', '1h30m') into seconds.

        Returns None when the string contains no recognizable duration parts.
        BUG FIX: the previous implementation used substring checks such as
        ``'1m' in duration`` which misfired on values like '21m'.
        """
        parts = re.findall(r'(\d+(?:\.\d+)?)(ms|s|m|h|d|w|y)', duration.strip())
        if not parts:
            return None
        return sum(float(value) * cls._DURATION_UNITS[unit] for value, unit in parts)

    def check_alert_name(self, alert_name: str) -> List[str]:
        """Check alert naming conventions (PascalCase, descriptive, non-generic)."""
        issues = []
        # PascalCase, e.g. HighCPUUsage (the regex intentionally rejects snake_case).
        if not re.match(r'^[A-Z][a-zA-Z0-9]*$', alert_name):
            issues.append(f"Alert name '{alert_name}' should use PascalCase (e.g., HighCPUUsage)")
        # Very short names are rarely descriptive enough.
        if len(alert_name) < 5:
            issues.append(f"Alert name '{alert_name}' is too short, use descriptive names")
        # Reject placeholder-style names outright.
        if alert_name in ('Alert', 'Test', 'Warning', 'Error'):
            issues.append(f"Alert name '{alert_name}' is too generic")
        return issues

    def check_labels(self, alert: Dict[str, Any]) -> List[str]:
        """Check required ('severity') and recommended ('team', 'component'/'service') labels."""
        issues = []
        labels = alert.get('labels', {})
        # 'severity' is mandatory and must use the standard three levels.
        if 'severity' not in labels:
            issues.append("Missing required 'severity' label (critical/warning/info)")
        elif labels['severity'] not in ('critical', 'warning', 'info'):
            issues.append(f"Severity '{labels['severity']}' should be one of: critical, warning, info")
        # Routing-oriented labels are recommended, not required.
        if 'team' not in labels:
            self.recommendations.append("Consider adding 'team' label for routing")
        if 'component' not in labels and 'service' not in labels:
            self.recommendations.append("Consider adding 'component' or 'service' label")
        return issues

    def check_annotations(self, alert: Dict[str, Any]) -> List[str]:
        """Check annotation quality: summary, description, runbook link, templating."""
        issues = []
        annotations = alert.get('annotations', {})
        if 'summary' not in annotations:
            issues.append("Missing 'summary' annotation")
        else:
            if len(annotations['summary']) < 10:
                issues.append("Summary annotation is too short, provide clear description")
            # '{{' covers '{{ $value }}' and any other template variable.
            if '{{' not in annotations['summary']:
                self.recommendations.append("Consider using template variables in summary (e.g., {{ $value }})")
        if 'description' not in annotations:
            issues.append("Missing 'description' annotation")
        # A runbook link dramatically speeds up incident response.
        if 'runbook_url' not in annotations and 'runbook' not in annotations:
            self.recommendations.append("Consider adding 'runbook_url' for incident response")
        return issues

    def check_expression(self, expr: str, alert_name: str) -> List[str]:
        """Heuristic checks on the PromQL expression."""
        issues = []
        # An alerting expression should compare against a threshold.
        if not any(op in expr for op in ('>', '<', '==', '!=')):
            issues.append("Expression should include a comparison operator")
        # Counters (*_total) should be rated, not used raw.
        if '_total' in expr and 'rate(' not in expr and 'increase(' not in expr:
            self.recommendations.append("Consider using rate() or increase() for counter metrics (*_total)")
        # Several selectors without any aggregation usually means noisy alerts.
        if not any(agg in expr for agg in ('sum(', 'avg(', 'min(', 'max(', 'count(')):
            if expr.count('{') > 1:  # multiple label selectors without aggregation
                self.recommendations.append("Consider aggregating metrics with sum(), avg(), etc.")
        # rate() without a range selector is invalid PromQL.
        if 'rate(' in expr and '[' not in expr:
            issues.append("rate() requires a time window (e.g., rate(metric[5m]))")
        return issues

    def check_for_duration(self, rule: Dict[str, Any]) -> List[str]:
        """Check the 'for' clause, which damps transient spikes (flapping)."""
        issues = []
        severity = rule.get('labels', {}).get('severity', 'unknown')
        if 'for' not in rule:
            if severity == 'critical':
                issues.append("Critical alerts should have 'for' clause to prevent flapping")
            else:
                self.warnings.append("Consider adding 'for' clause to prevent alert flapping")
        else:
            duration = rule['for']
            seconds = self._duration_to_seconds(str(duration))
            if seconds is None:
                issues.append(f"Could not parse 'for' duration: {duration!r}")
            elif severity == 'critical' and seconds < self._MIN_CRITICAL_FOR_SECONDS:
                self.warnings.append(f"'for' duration ({duration}) might be too short for critical alerts")
        return issues

    def check_alert_rule(self, rule: Dict[str, Any]) -> Dict[str, Any]:
        """Run every check on a single alerting rule and collect its issues."""
        alert_name = rule.get('alert', 'Unknown')
        issues = []
        issues.extend(self.check_alert_name(alert_name))
        if 'expr' not in rule:
            issues.append("Missing 'expr' field")
        else:
            issues.extend(self.check_expression(rule['expr'], alert_name))
        issues.extend(self.check_labels(rule))
        issues.extend(self.check_annotations(rule))
        issues.extend(self.check_for_duration(rule))
        return {
            "alert": alert_name,
            "issues": issues,
            "severity": rule.get('labels', {}).get('severity', 'unknown')
        }

    def analyze_file(self, filepath: str) -> Dict[str, Any]:
        """Load a Prometheus rules file and check every alerting rule in it.

        Recording rules (no 'alert' key) are skipped. Returns a summary dict,
        or {'error': ...} when the file cannot be read or parsed.
        """
        try:
            with open(filepath, 'r') as f:
                data = yaml.safe_load(f)
            if not data:
                return {"error": "Empty or invalid YAML file"}
            results = []
            groups = data.get('groups', [])
            for group in groups:
                group_name = group.get('name', 'Unknown')
                for rule in group.get('rules', []):
                    # Only alerting rules are audited; recording rules carry
                    # 'record' instead of 'alert'. Guard against scalar rules.
                    if isinstance(rule, dict) and 'alert' in rule:
                        result = self.check_alert_rule(rule)
                        result['group'] = group_name
                        results.append(result)
            return {
                "file": filepath,
                "groups": len(groups),
                "alerts_checked": len(results),
                "results": results
            }
        except Exception as e:
            # Boundary: report parse/IO failures instead of crashing the scan.
            return {"error": f"Failed to parse file: {e}"}
def print_results(analysis: Dict[str, Any], checker: AlertQualityChecker):
"""Pretty print analysis results."""
print("\n" + "="*60)
print("🚨 ALERT QUALITY CHECK RESULTS")
print("="*60)
if "error" in analysis:
print(f"\n❌ Error: {analysis['error']}")
return
print(f"\n📁 File: {analysis['file']}")
print(f"📊 Groups: {analysis['groups']}")
print(f"🔔 Alerts Checked: {analysis['alerts_checked']}")
# Count issues by severity
critical_count = 0
warning_count = 0
for result in analysis['results']:
if result['issues']:
critical_count += 1
print(f"\n{'='*60}")
print(f"📈 Summary:")
print(f" ❌ Alerts with Issues: {critical_count}")
print(f" ⚠️ Warnings: {len(checker.warnings)}")
print(f" 💡 Recommendations: {len(checker.recommendations)}")
# Print detailed results
if critical_count > 0:
print(f"\n{'='*60}")
print("❌ ALERTS WITH ISSUES:")
print(f"{'='*60}")
for result in analysis['results']:
if result['issues']:
print(f"\n🔔 Alert: {result['alert']} (Group: {result['group']})")
print(f" Severity: {result['severity']}")
print(" Issues:")
for issue in result['issues']:
print(f"{issue}")
# Print warnings
if checker.warnings:
print(f"\n{'='*60}")
print("⚠️ WARNINGS:")
print(f"{'='*60}")
for warning in set(checker.warnings): # Remove duplicates
print(f"{warning}")
# Print recommendations
if checker.recommendations:
print(f"\n{'='*60}")
print("💡 RECOMMENDATIONS:")
print(f"{'='*60}")
for rec in list(set(checker.recommendations))[:10]: # Top 10 unique recommendations
print(f"{rec}")
# Overall score
total_alerts = analysis['alerts_checked']
if total_alerts > 0:
quality_score = ((total_alerts - critical_count) / total_alerts) * 100
print(f"\n{'='*60}")
print(f"📊 Quality Score: {quality_score:.1f}% ({total_alerts - critical_count}/{total_alerts} alerts passing)")
print(f"{'='*60}\n")
def main():
    """CLI entry point: parse arguments, collect rule files, run the checker."""
    parser = argparse.ArgumentParser(
        description="Audit Prometheus alert rules for quality and best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Check a single file
python3 alert_quality_checker.py alerts.yml
# Check all YAML files in a directory
python3 alert_quality_checker.py /path/to/prometheus/rules/
Best Practices Checked:
✓ Alert naming conventions (PascalCase, descriptive)
✓ Required labels (severity)
✓ Required annotations (summary, description)
✓ Runbook URL presence
✓ PromQL expression quality
'for' clause to prevent flapping
✓ Template variable usage
""",
    )
    parser.add_argument('path', help='Path to alert rules file or directory')
    parser.add_argument('--verbose', action='store_true', help='Show all recommendations')
    args = parser.parse_args()

    checker = AlertQualityChecker()

    # Resolve the argument into the list of YAML files to inspect
    # (all *.yml first, then all *.yaml, matching recursive search order).
    target = Path(args.path)
    if target.is_file():
        files = [str(target)]
    elif target.is_dir():
        files = [str(p) for pattern in ('*.yml', '*.yaml') for p in target.rglob(pattern)]
    else:
        print(f"❌ Path not found: {args.path}")
        sys.exit(1)
    if not files:
        print(f"❌ No YAML files found in: {args.path}")
        sys.exit(1)

    print(f"🔍 Checking {len(files)} file(s)...")
    for filepath in files:
        print_results(checker.analyze_file(filepath), checker)


if __name__ == "__main__":
    main()

279
scripts/analyze_metrics.py Normal file
View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""
import argparse
import sys
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import statistics
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
try:
import boto3
except ImportError:
boto3 = None
class MetricAnalyzer:
    """Fetch metric series from Prometheus or CloudWatch and run simple analyses.

    ``detect_anomalies`` and ``analyze_trend`` are pure computations; the
    ``query_*`` methods perform network I/O against the configured backend.
    """

    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        self.source = source        # 'prometheus' or 'cloudwatch'
        self.endpoint = endpoint    # Prometheus base URL (required for Prometheus queries)
        self.region = region        # AWS region for CloudWatch
        if source == "cloudwatch" and boto3:
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        elif source == "cloudwatch" and not boto3:
            # boto3 is optional at import time; fail fast only when needed.
            print("⚠️ boto3 not installed. Install with: pip install boto3")
            sys.exit(1)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Run a PromQL range query over the last ``hours`` hours at 5m resolution.

        Returns the raw result series list, or [] on any failure.
        """
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)
            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }
            response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []
            return data['data']['result']
        except Exception as e:
            # Boundary: report and return empty rather than crash the CLI.
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Fetch CloudWatch datapoints (5-minute periods), sorted by timestamp.

        Returns [] on any failure.
        """
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)
            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]
            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )
            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])
        except Exception as e:
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Flag points more than ``sensitivity`` sample standard deviations from the mean.

        With fewer than 10 points, reports that detection was skipped.
        """
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }
        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)
        anomalies = [
            {
                "index": i,
                "value": value,
                # Deviation in sigmas; 0 when the series is constant (stdev == 0).
                "deviation": abs(value - mean) / stdev if stdev > 0 else 0,
            }
            for i, value in enumerate(values)
            if value > threshold_upper or value < threshold_lower
        ]
        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Classify the series as stable/increasing/decreasing via a least-squares slope."""
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}
        n = len(values)
        x_mean = (n - 1) / 2  # mean of 0..n-1
        y_mean = sum(values) / n
        numerator = sum((i - x_mean) * (y - y_mean) for i, y in enumerate(values))
        denominator = sum((i - x_mean) ** 2 for i in range(n))
        if denominator == 0:
            return {"trend": "flat", "slope": 0}
        slope = numerator / denominator
        # BUG FIX: compare against abs(y_mean). The old threshold
        # `0.01 * y_mean` was negative for negative-mean series, so such a
        # series could never be classified as 'stable'.
        if abs(slope) < 0.01 * abs(y_mean):  # under 1% change per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"
        return {
            "trend": trend,
            "slope": slope,
            "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0
        }
def print_results(results: Dict[str, Any]):
"""Pretty print analysis results."""
print("\n" + "="*60)
print("📊 METRIC ANALYSIS RESULTS")
print("="*60)
if "error" in results:
print(f"\n❌ Error: {results['error']}")
return
print(f"\n📈 Data Points: {results.get('data_points', 0)}")
# Trend analysis
if "trend" in results:
trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "")
print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}")
if "rate_of_change" in results["trend"]:
print(f" Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval")
# Anomaly detection
if "anomalies" in results:
anomaly_data = results["anomalies"]
if anomaly_data["anomalies_detected"]:
print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}")
print(f" Mean: {anomaly_data['stats']['mean']:.2f}")
print(f" Std Dev: {anomaly_data['stats']['stdev']:.2f}")
print(f" Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]")
print("\n Top Anomalies:")
for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]:
print(f" • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)")
else:
print("\n✅ No anomalies detected")
print("\n" + "="*60)
def main():
    """CLI entry point: query the chosen backend, then analyze and report."""
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Prometheus: Analyze request rate
python3 analyze_metrics.py prometheus \\
--endpoint http://localhost:9090 \\
--query 'rate(http_requests_total[5m])' \\
--hours 24
# CloudWatch: Analyze CPU utilization
python3 analyze_metrics.py cloudwatch \\
--namespace AWS/EC2 \\
--metric CPUUtilization \\
--dimensions InstanceId=i-1234567890abcdef0 \\
--hours 48
""",
    )
    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                        help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                        help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')
    args = parser.parse_args()

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)
        print(f"🔍 Querying Prometheus: {args.query}")
        series = analyzer.query_prometheus(args.query, args.hours)
        if not series:
            print("❌ No data returned")
            sys.exit(1)
        # Only the first returned series is analyzed.
        values = [float(sample[1]) for sample in series[0].get('values', [])]
    else:  # 'cloudwatch' — argparse choices guarantee one of the two
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)
        dims = dict(pair.split('=') for pair in args.dimensions.split(','))
        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        series = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours)
        if not series:
            print("❌ No data returned")
            sys.exit(1)
        values = [point['Average'] for point in series]

    print_results({
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity),
    })


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Generate Grafana dashboards from templates.
Supports: web applications, Kubernetes, databases, Redis, and custom metrics.
"""
import argparse
import sys
import json
from typing import Dict, List, Any, Optional
from pathlib import Path
class DashboardGenerator:
    """Assemble a Grafana dashboard JSON document panel-by-panel.

    Panels are appended top-to-bottom on Grafana's 24-column grid:
    ``row_y`` tracks the next free grid row and ``panel_id`` hands out
    unique panel ids. ``save`` writes the Grafana import payload
    (dashboard body plus ``overwrite``) to disk.
    """

    def __init__(self, title: str, datasource: str = "Prometheus"):
        self.title = title            # dashboard title shown in Grafana
        self.datasource = datasource  # datasource name referenced by every panel
        self.dashboard = self._create_base_dashboard()
        self.panel_id = 1             # next unique panel id to assign
        self.row_y = 0                # next free y coordinate on the grid

    def _create_base_dashboard(self) -> Dict[str, Any]:
        """Create base dashboard structure.

        Returns the import-API payload: the dashboard body plus
        ``overwrite: True`` so re-importing replaces an existing dashboard.
        """
        return {
            "dashboard": {
                "title": self.title,
                "tags": [],
                "timezone": "browser",
                "schemaVersion": 16,
                "version": 0,
                "refresh": "30s",
                "panels": [],
                "templating": {
                    "list": []
                },
                "time": {
                    "from": "now-6h",
                    "to": "now"
                }
            },
            "overwrite": True
        }

    def add_variable(self, name: str, label: str, query: str):
        """Add a query-backed template variable.

        ``refresh: 1`` re-runs the query on dashboard load; multi-select
        and include-all are disabled, so exactly one value is selected.
        """
        variable = {
            "name": name,
            "label": label,
            "type": "query",
            "datasource": self.datasource,
            "query": query,
            "refresh": 1,
            "regex": "",
            "multi": False,
            "includeAll": False
        }
        self.dashboard["dashboard"]["templating"]["list"].append(variable)

    def add_row(self, title: str):
        """Add a row panel (full-width, 1 grid unit tall) and advance the cursor."""
        panel = {
            "id": self.panel_id,
            "type": "row",
            "title": title,
            "collapsed": False,
            "gridPos": {"h": 1, "w": 24, "x": 0, "y": self.row_y}
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += 1

    def add_graph(self, title: str, targets: List[Dict[str, str]], unit: str = "short",
                  width: int = 12, height: int = 8):
        """Add a graph panel with one query target per entry in ``targets``.

        Each target dict needs a 'query' key and may carry a 'legend' key.
        NOTE(review): x is always 0, so a width < 24 leaves the right side of
        the grid empty; panels stack vertically via ``row_y``. refIds are
        assigned A, B, C, ... — more than 26 targets would produce
        non-letter ids; presumably never the case here.
        """
        panel = {
            "id": self.panel_id,
            "type": "graph",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": target["query"],
                    "legendFormat": target.get("legend", ""),
                    "refId": chr(65 + i)  # A, B, C, etc.
                }
                for i, target in enumerate(targets)
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "yaxes": [
                {"format": unit, "label": None, "show": True},
                {"format": "short", "label": None, "show": True}
            ],
            "lines": True,
            "fill": 1,
            "linewidth": 2,
            "legend": {
                "show": True,
                "alignAsTable": True,
                "avg": True,
                "current": True,
                "max": True,
                "min": False,
                "total": False,
                "values": True
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1
        self.row_y += height

    def add_stat(self, title: str, query: str, unit: str = "short",
                 width: int = 6, height: int = 4):
        """Add a stat panel (single value) showing the latest non-null sample.

        NOTE(review): unlike add_graph/add_row, this does NOT advance
        ``row_y``, so consecutive stat panels share the same gridPos —
        presumably relying on Grafana re-flowing overlapping panels on
        import; confirm against the target Grafana version.
        """
        panel = {
            "id": self.panel_id,
            "type": "stat",
            "title": title,
            "datasource": self.datasource,
            "targets": [
                {
                    "expr": query,
                    "refId": "A"
                }
            ],
            "gridPos": {"h": height, "w": width, "x": 0, "y": self.row_y},
            "options": {
                "graphMode": "area",
                "orientation": "auto",
                "reduceOptions": {
                    "values": False,
                    "calcs": ["lastNotNull"]
                }
            },
            "fieldConfig": {
                "defaults": {
                    "unit": unit,
                    # Green below 80, red at/above 80 (absolute thresholds).
                    "thresholds": {
                        "mode": "absolute",
                        "steps": [
                            {"value": None, "color": "green"},
                            {"value": 80, "color": "red"}
                        ]
                    }
                }
            }
        }
        self.dashboard["dashboard"]["panels"].append(panel)
        self.panel_id += 1

    def generate_webapp_dashboard(self, service: str):
        """Generate dashboard for web application.

        Builds request-rate, latency-quantile, error-rate and resource panels
        from the service's Prometheus metric names.
        NOTE(review): a $service template variable is added, but every query
        interpolates the Python ``service`` argument directly instead of
        referencing $service — confirm whether the variable is intended.
        """
        self.add_variable("service", "Service", f"label_values({service}_http_requests_total, service)")
        # Request metrics
        self.add_row("Request Metrics")
        self.add_graph(
            "Request Rate",
            [{"query": f'sum(rate({service}_http_requests_total[5m])) by (status)', "legend": "{{status}}"}],
            unit="reqps",
            width=12
        )
        self.add_graph(
            "Request Latency (p50, p95, p99)",
            [
                {"query": f'histogram_quantile(0.50, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p50"},
                {"query": f'histogram_quantile(0.95, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p95"},
                {"query": f'histogram_quantile(0.99, sum(rate({service}_http_request_duration_seconds_bucket[5m])) by (le))', "legend": "p99"}
            ],
            unit="s",
            width=12
        )
        # Error rate (share of 5xx responses)
        self.add_row("Errors")
        self.add_graph(
            "Error Rate (%)",
            [{"query": f'sum(rate({service}_http_requests_total{{status=~"5.."}}[5m])) / sum(rate({service}_http_requests_total[5m])) * 100', "legend": "Error Rate"}],
            unit="percent",
            width=12
        )
        # Resource usage (process exporter metrics scoped by job)
        self.add_row("Resource Usage")
        self.add_graph(
            "CPU Usage",
            [{"query": f'sum(rate(process_cpu_seconds_total{{job="{service}"}}[5m])) * 100', "legend": "CPU %"}],
            unit="percent",
            width=12
        )
        self.add_graph(
            "Memory Usage",
            [{"query": f'process_resident_memory_bytes{{job="{service}"}}', "legend": "Memory"}],
            unit="bytes",
            width=12
        )

    def generate_kubernetes_dashboard(self, namespace: str):
        """Generate dashboard for Kubernetes cluster.

        Pod counts, per-pod CPU/memory, and network I/O from
        kube-state-metrics / cAdvisor metric names.
        NOTE(review): as with the webapp dashboard, the $namespace variable
        is added but queries hardcode the ``namespace`` argument.
        """
        self.add_variable("namespace", "Namespace", f"label_values(kube_pod_info, namespace)")
        # Cluster overview (four stat panels)
        self.add_row("Cluster Overview")
        self.add_stat("Total Pods", f'count(kube_pod_info{{namespace="{namespace}"}})', width=6)
        self.add_stat("Running Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Running"}})', width=6)
        self.add_stat("Pending Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Pending"}})', width=6)
        self.add_stat("Failed Pods", f'count(kube_pod_status_phase{{namespace="{namespace}", phase="Failed"}})', width=6)
        # Resource usage
        self.add_row("Resource Usage")
        self.add_graph(
            "CPU Usage by Pod",
            [{"query": f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "{{pod}}"}],
            unit="percent",
            width=12
        )
        self.add_graph(
            "Memory Usage by Pod",
            [{"query": f'sum(container_memory_usage_bytes{{namespace="{namespace}"}}) by (pod)', "legend": "{{pod}}"}],
            unit="bytes",
            width=12
        )
        # Network throughput per pod
        self.add_row("Network")
        self.add_graph(
            "Network I/O",
            [
                {"query": f'sum(rate(container_network_receive_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Receive - {{pod}}"},
                {"query": f'sum(rate(container_network_transmit_bytes_total{{namespace="{namespace}"}}[5m])) by (pod)', "legend": "Transmit - {{pod}}"}
            ],
            unit="Bps",
            width=12
        )

    def generate_database_dashboard(self, db_type: str, instance: str):
        """Generate dashboard for database (postgres/mysql).

        Dispatches to the engine-specific builder; unknown db_type is a no-op.
        """
        if db_type == "postgres":
            self._generate_postgres_dashboard(instance)
        elif db_type == "mysql":
            self._generate_mysql_dashboard(instance)

    def _generate_postgres_dashboard(self, instance: str):
        """Generate PostgreSQL dashboard (postgres_exporter metric names)."""
        self.add_row("PostgreSQL Metrics")
        self.add_graph(
            "Connections",
            [
                {"query": f'pg_stat_database_numbackends{{instance="{instance}"}}', "legend": "{{datname}}"}
            ],
            unit="short",
            width=12
        )
        self.add_graph(
            "Transactions per Second",
            [
                {"query": f'rate(pg_stat_database_xact_commit{{instance="{instance}"}}[5m])', "legend": "Commits"},
                {"query": f'rate(pg_stat_database_xact_rollback{{instance="{instance}"}}[5m])', "legend": "Rollbacks"}
            ],
            unit="tps",
            width=12
        )
        self.add_graph(
            "Query Duration (p95)",
            [
                # NOTE(review): assumes a pg_stat_statements_total_time_bucket
                # histogram is exported — not a stock postgres_exporter metric;
                # verify against the exporter configuration.
                {"query": f'histogram_quantile(0.95, rate(pg_stat_statements_total_time_bucket{{instance="{instance}"}}[5m]))', "legend": "p95"}
            ],
            unit="ms",
            width=12
        )

    def _generate_mysql_dashboard(self, instance: str):
        """Generate MySQL dashboard (mysqld_exporter metric names)."""
        self.add_row("MySQL Metrics")
        self.add_graph(
            "Connections",
            [
                {"query": f'mysql_global_status_threads_connected{{instance="{instance}"}}', "legend": "Connected"},
                {"query": f'mysql_global_status_threads_running{{instance="{instance}"}}', "legend": "Running"}
            ],
            unit="short",
            width=12
        )
        self.add_graph(
            "Queries per Second",
            [
                {"query": f'rate(mysql_global_status_queries{{instance="{instance}"}}[5m])', "legend": "Queries"}
            ],
            unit="qps",
            width=12
        )

    def save(self, output_file: str) -> bool:
        """Write the dashboard payload as pretty-printed JSON.

        Returns True on success; prints the error and returns False otherwise.
        """
        try:
            with open(output_file, 'w') as f:
                json.dump(self.dashboard, f, indent=2)
            return True
        except Exception as e:
            # Boundary: report rather than propagate so the CLI can exit(1).
            print(f"❌ Error saving dashboard: {e}")
            return False
def main():
    """CLI entry point: build the requested dashboard template and write it to disk."""
    parser = argparse.ArgumentParser(
        description="Generate Grafana dashboards from templates",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Web application dashboard
python3 dashboard_generator.py webapp \\
--title "My API Dashboard" \\
--service my_api \\
--output dashboard.json
# Kubernetes dashboard
python3 dashboard_generator.py kubernetes \\
--title "K8s Namespace" \\
--namespace production \\
--output k8s-dashboard.json
# Database dashboard
python3 dashboard_generator.py database \\
--title "PostgreSQL" \\
--db-type postgres \\
--instance db.example.com:5432 \\
--output db-dashboard.json
""",
    )
    parser.add_argument('type', choices=['webapp', 'kubernetes', 'database'],
                        help='Dashboard type')
    parser.add_argument('--title', required=True, help='Dashboard title')
    parser.add_argument('--output', required=True, help='Output file path')
    parser.add_argument('--datasource', default='Prometheus', help='Data source name')
    # Web app specific
    parser.add_argument('--service', help='Service name (for webapp)')
    # Kubernetes specific
    parser.add_argument('--namespace', help='Kubernetes namespace')
    # Database specific
    parser.add_argument('--db-type', choices=['postgres', 'mysql'], help='Database type')
    parser.add_argument('--instance', help='Database instance')
    args = parser.parse_args()

    print(f"🎨 Generating {args.type} dashboard: {args.title}")
    generator = DashboardGenerator(args.title, args.datasource)

    # Each dashboard type validates its own required options before building.
    if args.type == 'webapp':
        if not args.service:
            print("❌ --service required for webapp dashboard")
            sys.exit(1)
        generator.generate_webapp_dashboard(args.service)
    elif args.type == 'kubernetes':
        if not args.namespace:
            print("❌ --namespace required for kubernetes dashboard")
            sys.exit(1)
        generator.generate_kubernetes_dashboard(args.namespace)
    elif args.type == 'database':
        if not args.db_type or not args.instance:
            print("❌ --db-type and --instance required for database dashboard")
            sys.exit(1)
        generator.generate_database_dashboard(args.db_type, args.instance)

    if not generator.save(args.output):
        sys.exit(1)
    print(f"✅ Dashboard saved to: {args.output}")
    print(f"\n📝 Import to Grafana:")
    print(f" 1. Go to Grafana → Dashboards → Import")
    print(f" 2. Upload {args.output}")
    print(f" 3. Select datasource and save")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
Analyze Datadog usage and identify cost optimization opportunities.
Helps find waste in custom metrics, logs, APM, and infrastructure monitoring.
"""
import argparse
import sys
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from collections import defaultdict
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
try:
from tabulate import tabulate
except ImportError:
tabulate = None
class DatadogCostAnalyzer:
# Pricing (as of 2024-2025)
PRICING = {
'infrastructure_pro': 15, # per host per month
'infrastructure_enterprise': 23,
'custom_metric': 0.01, # per metric per month (first 100 free per host)
'log_ingestion': 0.10, # per GB ingested per month
'apm_host': 31, # APM Pro per host per month
'apm_span': 1.70, # per million indexed spans
}
def __init__(self, api_key: str, app_key: str, site: str = "datadoghq.com"):
self.api_key = api_key
self.app_key = app_key
self.site = site
self.base_url = f"https://api.{site}"
self.headers = {
'DD-API-KEY': api_key,
'DD-APPLICATION-KEY': app_key,
'Content-Type': 'application/json'
}
def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
"""Make API request to Datadog."""
try:
url = f"{self.base_url}{endpoint}"
response = requests.get(url, headers=self.headers, params=params, timeout=30)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
print(f"❌ API Error: {e}")
return {}
def get_usage_metrics(self, start_date: str, end_date: str) -> Dict[str, Any]:
"""Get usage metrics for specified date range."""
endpoint = "/api/v1/usage/summary"
params = {
'start_month': start_date,
'end_month': end_date,
'include_org_details': 'true'
}
data = self._make_request(endpoint, params)
return data.get('usage', [])
def get_custom_metrics(self) -> Dict[str, Any]:
"""Get custom metrics usage and identify high-cardinality metrics."""
endpoint = "/api/v1/usage/timeseries"
# Get last 30 days
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'metrics': [], 'total_count': 0}
# Extract custom metrics info
usage_data = data.get('usage', [])
metrics_summary = {
'total_custom_metrics': 0,
'avg_custom_metrics': 0,
'billable_metrics': 0
}
for day in usage_data:
if 'timeseries' in day:
for ts in day['timeseries']:
if ts.get('metric_category') == 'custom':
metrics_summary['total_custom_metrics'] = max(
metrics_summary['total_custom_metrics'],
ts.get('num_custom_timeseries', 0)
)
# Calculate billable (first 100 free)
metrics_summary['billable_metrics'] = max(0, metrics_summary['total_custom_metrics'] - 100)
return metrics_summary
def get_infrastructure_hosts(self) -> Dict[str, Any]:
"""Get infrastructure host count and breakdown."""
endpoint = "/api/v1/usage/hosts"
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'total_hosts': 0}
usage = data.get('usage', [])
host_summary = {
'total_hosts': 0,
'agent_hosts': 0,
'aws_hosts': 0,
'azure_hosts': 0,
'gcp_hosts': 0,
'container_count': 0
}
for day in usage:
host_summary['total_hosts'] = max(host_summary['total_hosts'], day.get('host_count', 0))
host_summary['agent_hosts'] = max(host_summary['agent_hosts'], day.get('agent_host_count', 0))
host_summary['aws_hosts'] = max(host_summary['aws_hosts'], day.get('aws_host_count', 0))
host_summary['azure_hosts'] = max(host_summary['azure_hosts'], day.get('azure_host_count', 0))
host_summary['gcp_hosts'] = max(host_summary['gcp_hosts'], day.get('gcp_host_count', 0))
host_summary['container_count'] = max(host_summary['container_count'], day.get('container_count', 0))
return host_summary
def get_log_usage(self) -> Dict[str, Any]:
"""Get log ingestion and retention usage."""
endpoint = "/api/v1/usage/logs"
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
params = {
'start_hr': int(start_date.timestamp()),
'end_hr': int(end_date.timestamp())
}
data = self._make_request(endpoint, params)
if not data:
return {'total_gb': 0, 'daily_avg_gb': 0}
usage = data.get('usage', [])
total_ingested = 0
days_count = len(usage)
for day in usage:
total_ingested += day.get('ingested_events_bytes', 0)
total_gb = total_ingested / (1024**3) # Convert to GB
daily_avg_gb = total_gb / max(days_count, 1)
return {
'total_gb': total_gb,
'daily_avg_gb': daily_avg_gb,
'monthly_projected_gb': daily_avg_gb * 30
}
def get_unused_monitors(self) -> List[Dict[str, Any]]:
    """Find monitors in OK / No Data state not modified in 30+ days.

    Returns:
        List of dicts with 'name', 'id', 'days_since_modified', 'state'.
        Monitors with a missing or unparseable 'modified' timestamp are
        skipped rather than aborting the scan.
    """
    # Local import: keeps the fix self-contained without touching the
    # module header (which is outside this block).
    from datetime import timezone

    data = self._make_request("/api/v1/monitor")
    if not data:
        return []
    monitors = data if isinstance(data, list) else []
    unused = []
    # BUG FIX: compare in UTC.  The API returns ISO-8601 'Z' timestamps;
    # the old code stripped tzinfo and compared against the *local* naive
    # clock, skewing the age by the machine's UTC offset.
    now = datetime.now(timezone.utc)
    for monitor in monitors:
        overall_state = monitor.get('overall_state')
        modified = monitor.get('modified', '')
        if not modified:
            continue
        try:
            mod_date = datetime.fromisoformat(modified.replace('Z', '+00:00'))
        except (ValueError, TypeError):
            # BUG FIX: was a bare `except: pass` (also swallowed
            # SystemExit/KeyboardInterrupt).  Skip only bad timestamps.
            continue
        if mod_date.tzinfo is None:
            mod_date = mod_date.replace(tzinfo=timezone.utc)
        days_since_modified = (now - mod_date).days
        # "Unused": quiet state and untouched for more than a month.
        if days_since_modified > 30 and overall_state in ['OK', 'No Data']:
            unused.append({
                'name': monitor.get('name', 'Unknown'),
                'id': monitor.get('id'),
                'days_since_modified': days_since_modified,
                'state': overall_state,
            })
    return unused
def calculate_costs(self, usage_data: Dict[str, Any]) -> Dict[str, float]:
    """Estimate monthly spend per product line from gathered usage data.

    Unit prices come from self.PRICING; infrastructure assumes the Pro
    tier.  Missing usage sections simply contribute zero.
    """
    costs = dict.fromkeys(('infrastructure', 'custom_metrics', 'logs', 'apm', 'total'), 0)

    if 'hosts' in usage_data:
        host_count = usage_data['hosts'].get('total_hosts', 0)
        costs['infrastructure'] = host_count * self.PRICING['infrastructure_pro']

    if 'custom_metrics' in usage_data:
        billable_metrics = usage_data['custom_metrics'].get('billable_metrics', 0)
        costs['custom_metrics'] = billable_metrics * self.PRICING['custom_metric']

    if 'logs' in usage_data:
        projected_gb = usage_data['logs'].get('monthly_projected_gb', 0)
        costs['logs'] = projected_gb * self.PRICING['log_ingestion']

    # 'total' is still zero at this point, so summing every value is safe.
    costs['total'] = sum(costs.values())
    return costs
def get_recommendations(self, usage_data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Generate cost optimization recommendations.

    Args:
        usage_data: Aggregated output of the get_* usage methods; any key
            may be absent if its API call failed.

    Returns:
        List of dicts with 'category', 'issue', 'action' and
        'potential_savings' keys.  (Annotation fixed: the original
        declared List[str] but has always returned dicts.)
    """
    recommendations = []

    # Custom metrics: flag high-cardinality growth.
    if 'custom_metrics' in usage_data:
        billable = usage_data['custom_metrics'].get('billable_metrics', 0)
        if billable > 500:
            savings = (billable * 0.3) * self.PRICING['custom_metric']  # Assume 30% reduction possible
            recommendations.append({
                'category': 'Custom Metrics',
                'issue': f'High custom metric count: {billable:,} billable metrics',
                'action': 'Review metric tags for high cardinality, consider aggregating or dropping unused metrics',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Container vs VM: dense container fleets are cheaper on container plans.
    if 'hosts' in usage_data:
        hosts = usage_data['hosts'].get('total_hosts', 0)
        containers = usage_data['hosts'].get('container_count', 0)
        if containers > hosts * 10:  # Many containers per host
            savings = hosts * 0.2 * self.PRICING['infrastructure_pro']
            recommendations.append({
                'category': 'Infrastructure',
                'issue': f'{containers:,} containers running on {hosts} hosts',
                'action': 'Consider using container monitoring instead of host-based (can be 50-70% cheaper)',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Unused monitors: operational hygiene rather than direct spend.
    if 'unused_monitors' in usage_data:
        count = len(usage_data['unused_monitors'])
        if count > 10:
            recommendations.append({
                'category': 'Monitors',
                'issue': f'{count} monitors unused for 30+ days',
                'action': 'Delete or disable unused monitors to reduce noise and improve performance',
                'potential_savings': 'Operational efficiency'
            })

    # Log volume: sampling/exclusion typically trims a large share.
    if 'logs' in usage_data:
        monthly_gb = usage_data['logs'].get('monthly_projected_gb', 0)
        if monthly_gb > 100:
            savings = (monthly_gb * 0.4) * self.PRICING['log_ingestion']  # 40% reduction
            recommendations.append({
                'category': 'Logs',
                'issue': f'High log volume: {monthly_gb:.1f} GB/month projected',
                'action': 'Review log sources, implement sampling for debug logs, exclude health checks',
                'potential_savings': f'${savings:.2f}/month'
            })

    # Strategic migration advice once spend is significant.
    costs = self.calculate_costs(usage_data)
    if costs['total'] > 5000:
        # BUG FIX: guard the 'hosts' lookup — it may be missing when the
        # hosts API call failed, which previously raised KeyError here.
        host_count = usage_data.get('hosts', {}).get('total_hosts', 0)
        oss_cost = host_count * 15  # Rough estimate for self-hosted
        savings = costs['total'] - oss_cost
        recommendations.append({
            'category': 'Strategic',
            'issue': f'Total monthly cost: ${costs["total"]:.2f}',
            'action': 'Consider migrating to open-source stack (Prometheus + Grafana + Loki)',
            'potential_savings': f'${savings:.2f}/month (~{(savings/costs["total"]*100):.0f}% reduction)'
        })
    return recommendations
def print_usage_summary(usage_data: Dict[str, Any]):
    """Print a human-readable summary of the collected Datadog usage data.

    Sections (infrastructure, custom metrics, logs, unused monitors) are
    printed only when their key is present in usage_data.
    """
    print("\n" + "=" * 70)
    print("📊 DATADOG USAGE SUMMARY")
    print("=" * 70)

    if 'hosts' in usage_data:
        hosts = usage_data['hosts']
        print(f"\n🖥️ Infrastructure:")
        host_rows = (
            ('Total Hosts', 'total_hosts'),
            ('Agent Hosts', 'agent_hosts'),
            ('AWS Hosts', 'aws_hosts'),
            ('Azure Hosts', 'azure_hosts'),
            ('GCP Hosts', 'gcp_hosts'),
            ('Containers', 'container_count'),
        )
        for label, key in host_rows:
            print(f" {label}: {hosts.get(key, 0):,}")

    if 'custom_metrics' in usage_data:
        metrics = usage_data['custom_metrics']
        print(f"\n📈 Custom Metrics:")
        print(f" Total: {metrics.get('total_custom_metrics', 0):,}")
        print(f" Billable: {metrics.get('billable_metrics', 0):,} (first 100 free)")

    if 'logs' in usage_data:
        logs = usage_data['logs']
        print(f"\n📝 Logs:")
        print(f" Daily Average: {logs.get('daily_avg_gb', 0):.2f} GB")
        print(f" Monthly Projected: {logs.get('monthly_projected_gb', 0):.2f} GB")

    if 'unused_monitors' in usage_data:
        print(f"\n🔔 Unused Monitors:")
        print(f" Count: {len(usage_data['unused_monitors'])}")
def print_cost_breakdown(costs: Dict[str, float]):
    """Print the estimated monthly cost breakdown with an annualized total.

    Expects the dict shape produced by calculate_costs().
    """
    print("\n" + "=" * 70)
    print("💰 ESTIMATED MONTHLY COSTS")
    print("=" * 70)
    body = [
        f" Infrastructure Monitoring: ${costs['infrastructure']:,.2f}",
        f" Custom Metrics: ${costs['custom_metrics']:,.2f}",
        f" Log Management: ${costs['logs']:,.2f}",
        f" APM: ${costs['apm']:,.2f}",
        " " + "-" * 40,
        f" TOTAL: ${costs['total']:,.2f}/month",
        f" ${costs['total'] * 12:,.2f}/year",
    ]
    # Leading newline matches the blank line the section always starts with.
    print("\n" + "\n".join(body))
def print_recommendations(recommendations: List[Dict]):
    """Print cost optimization recommendations plus total potential savings.

    Savings strings of the form '$X/month' are summed into the totals;
    non-dollar entries (e.g. 'Operational efficiency') are skipped.
    """
    print("\n" + "="*70)
    print("💡 COST OPTIMIZATION RECOMMENDATIONS")
    print("="*70)
    total_savings = 0
    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec['category']}")
        print(f" Issue: {rec['issue']}")
        print(f" Action: {rec['action']}")
        print(f" Potential Savings: {rec['potential_savings']}")
        # Extract savings amount if it's a dollar value
        if '$' in rec['potential_savings']:
            amount_text = rec['potential_savings'].replace('$', '').replace('/month', '').replace(',', '')
            try:
                total_savings += float(amount_text)
            except ValueError:
                # BUG FIX: was a bare `except:` (also swallowed SystemExit /
                # KeyboardInterrupt).  Savings strings carrying annotations,
                # e.g. '... (~40% reduction)', are not plain numbers and are
                # deliberately left out of the total.
                pass
    if total_savings > 0:
        print(f"\n{'='*70}")
        print(f"💵 Total Potential Monthly Savings: ${total_savings:,.2f}")
        print(f"💵 Total Potential Annual Savings: ${total_savings*12:,.2f}")
        print(f"{'='*70}")
def main() -> None:
    """CLI entry point: gather Datadog usage, estimate costs, print advice.

    Requires API + Application keys (flags or DD_API_KEY / DD_APP_KEY env
    vars); exits with status 1 when either is missing.
    """
    parser = argparse.ArgumentParser(
        description="Analyze Datadog usage and identify cost optimization opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Analyze current usage
python3 datadog_cost_analyzer.py \\
--api-key DD_API_KEY \\
--app-key DD_APP_KEY
# Use environment variables
export DD_API_KEY=your_api_key
export DD_APP_KEY=your_app_key
python3 datadog_cost_analyzer.py
# Specify site (for EU)
python3 datadog_cost_analyzer.py --site datadoghq.eu
Required Datadog Permissions:
- usage_read
- monitors_read
"""
    )
    parser.add_argument('--api-key',
                        default=os.environ.get('DD_API_KEY'),
                        help='Datadog API key (or set DD_API_KEY env var)')
    parser.add_argument('--app-key',
                        default=os.environ.get('DD_APP_KEY'),
                        help='Datadog Application key (or set DD_APP_KEY env var)')
    parser.add_argument('--site',
                        default='datadoghq.com',
                        help='Datadog site (default: datadoghq.com, EU: datadoghq.eu)')
    args = parser.parse_args()
    # Both credentials are mandatory: fail fast with a clear message.
    if not args.api_key or not args.app_key:
        print("❌ Error: API key and Application key required")
        print(" Set via --api-key and --app-key flags or DD_API_KEY and DD_APP_KEY env vars")
        sys.exit(1)
    print("🔍 Analyzing Datadog usage...")
    print(" This may take 30-60 seconds...\n")
    analyzer = DatadogCostAnalyzer(args.api_key, args.app_key, args.site)
    # Gather usage data (each call hits a separate usage API endpoint).
    usage_data = {}
    print(" ⏳ Fetching infrastructure usage...")
    usage_data['hosts'] = analyzer.get_infrastructure_hosts()
    print(" ⏳ Fetching custom metrics...")
    usage_data['custom_metrics'] = analyzer.get_custom_metrics()
    print(" ⏳ Fetching log usage...")
    usage_data['logs'] = analyzer.get_log_usage()
    print(" ⏳ Finding unused monitors...")
    usage_data['unused_monitors'] = analyzer.get_unused_monitors()
    # Calculate costs
    costs = analyzer.calculate_costs(usage_data)
    # Generate recommendations
    recommendations = analyzer.get_recommendations(usage_data)
    # Print results
    print_usage_summary(usage_data)
    print_cost_breakdown(costs)
    print_recommendations(recommendations)
    print("\n" + "="*70)
    print("✅ Analysis complete!")
    print("="*70)

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Validate health check endpoints and analyze response quality.
Checks: response time, status code, response format, dependencies.
"""
import argparse
import sys
import time
import json
from typing import Dict, List, Any, Optional
from urllib.parse import urlparse
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
class HealthCheckValidator:
    """Validates HTTP health-check endpoints against common best practices.

    Each validation records passed checks, warnings, and errors, then rolls
    them up into an overall HEALTHY / DEGRADED / UNHEALTHY status.
    """

    def __init__(self, timeout: int = 5):
        """
        Args:
            timeout: Per-request timeout in seconds for requests.get().
        """
        self.timeout = timeout
        self.results = []  # not read within this class; kept for API compatibility

    def validate_endpoint(self, url: str) -> Dict[str, Any]:
        """Validate a health check endpoint.

        Performs an HTTP GET and grades the response on status code,
        latency, content type, JSON structure, and caching headers.

        Returns:
            Dict with 'url', 'timestamp', 'checks'/'warnings'/'errors'
            lists, 'status_code'/'response_time' (None on request failure),
            and 'overall_status' of HEALTHY, DEGRADED, or UNHEALTHY.
        """
        result = {
            "url": url,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "checks": [],
            "warnings": [],
            "errors": []
        }
        try:
            # Time the request; verify=True enforces TLS certificate checks.
            start_time = time.time()
            response = requests.get(url, timeout=self.timeout, verify=True)
            response_time = time.time() - start_time
            result["status_code"] = response.status_code
            result["response_time"] = response_time
            # Check 1: Status code
            if response.status_code == 200:
                result["checks"].append("✅ Status code is 200")
            else:
                result["errors"].append(f"❌ Unexpected status code: {response.status_code} (expected 200)")
            # Check 2: Response time — <1s passes, <3s warns, else errors.
            if response_time < 1.0:
                result["checks"].append(f"✅ Response time: {response_time:.3f}s (< 1s)")
            elif response_time < 3.0:
                result["warnings"].append(f"⚠️ Slow response time: {response_time:.3f}s (should be < 1s)")
            else:
                result["errors"].append(f"❌ Very slow response time: {response_time:.3f}s (should be < 1s)")
            # Check 3: Content type — JSON preferred, plain text tolerated.
            content_type = response.headers.get('Content-Type', '')
            if 'application/json' in content_type:
                result["checks"].append("✅ Content-Type is application/json")
                # Try to parse JSON
                try:
                    data = response.json()
                    result["response_data"] = data
                    # Check for common health check fields
                    self._validate_json_structure(data, result)
                except json.JSONDecodeError:
                    result["errors"].append("❌ Invalid JSON response")
            elif 'text/plain' in content_type:
                result["warnings"].append("⚠️ Content-Type is text/plain (JSON recommended)")
                result["response_data"] = response.text
            else:
                result["warnings"].append(f"⚠️ Unexpected Content-Type: {content_type}")
            # Check 4: Response headers
            self._validate_headers(response.headers, result)
        except requests.exceptions.Timeout:
            result["errors"].append(f"❌ Request timeout (> {self.timeout}s)")
            result["status_code"] = None
            result["response_time"] = None
        # BUG FIX: requests.exceptions.SSLError subclasses ConnectionError,
        # so it must be caught FIRST — the original order made the SSL
        # branch unreachable and reported cert failures as plain
        # connection errors.
        except requests.exceptions.SSLError:
            result["errors"].append("❌ SSL certificate validation failed")
            result["status_code"] = None
            result["response_time"] = None
        except requests.exceptions.ConnectionError:
            result["errors"].append("❌ Connection error (endpoint unreachable)")
            result["status_code"] = None
            result["response_time"] = None
        except Exception as e:
            # Catch-all so one bad endpoint never aborts a multi-URL run.
            result["errors"].append(f"❌ Unexpected error: {str(e)}")
            result["status_code"] = None
            result["response_time"] = None
        # Roll-up: any error → UNHEALTHY, else any warning → DEGRADED.
        if result["errors"]:
            result["overall_status"] = "UNHEALTHY"
        elif result["warnings"]:
            result["overall_status"] = "DEGRADED"
        else:
            result["overall_status"] = "HEALTHY"
        return result

    def _validate_json_structure(self, data: Dict[str, Any], result: Dict[str, Any]):
        """Validate JSON health check structure.

        Appends findings about the 'status' field, version/build info,
        dependency checks, and metrics into *result* in place.
        """
        # Check for status field
        if "status" in data:
            status = data["status"]
            if status in ["ok", "healthy", "up", "pass"]:
                result["checks"].append(f"✅ Status field present: '{status}'")
            else:
                result["warnings"].append(f"⚠️ Status field has unexpected value: '{status}'")
        else:
            result["warnings"].append("⚠️ Missing 'status' field (recommended)")
        # Check for version/build info
        if any(key in data for key in ["version", "build", "commit", "timestamp"]):
            result["checks"].append("✅ Version/build information present")
        else:
            result["warnings"].append("⚠️ No version/build information (recommended)")
        # Check for dependencies
        if "dependencies" in data or "checks" in data or "components" in data:
            result["checks"].append("✅ Dependency checks present")
            # Validate dependency structure: entries may be dicts with a
            # 'status' key or bare status strings.
            deps = data.get("dependencies") or data.get("checks") or data.get("components")
            if isinstance(deps, dict):
                unhealthy_deps = []
                for name, info in deps.items():
                    if isinstance(info, dict):
                        dep_status = info.get("status", "unknown")
                        if dep_status not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)
                    elif isinstance(info, str):
                        if info not in ["ok", "healthy", "up", "pass"]:
                            unhealthy_deps.append(name)
                if unhealthy_deps:
                    result["warnings"].append(f"⚠️ Unhealthy dependencies: {', '.join(unhealthy_deps)}")
                else:
                    result["checks"].append(f"✅ All dependencies healthy ({len(deps)} checked)")
        else:
            result["warnings"].append("⚠️ No dependency checks (recommended for production services)")
        # Check for uptime/metrics
        if any(key in data for key in ["uptime", "metrics", "stats"]):
            result["checks"].append("✅ Metrics/stats present")

    def _validate_headers(self, headers: Dict[str, str], result: Dict[str, Any]):
        """Validate response headers (caching must be explicitly disabled)."""
        cache_control = headers.get('Cache-Control', '')
        if 'no-cache' in cache_control or 'no-store' in cache_control:
            result["checks"].append("✅ Caching disabled (Cache-Control: no-cache)")
        else:
            result["warnings"].append("⚠️ Caching not explicitly disabled (add Cache-Control: no-cache)")

    def validate_multiple(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Validate multiple health check endpoints, in order."""
        results = []
        for url in urls:
            print(f"🔍 Checking: {url}")
            result = self.validate_endpoint(url)
            results.append(result)
        return results
def print_result(result: Dict[str, Any], verbose: bool = False):
    """Pretty-print one endpoint validation result.

    Args:
        result: Dict produced by HealthCheckValidator.validate_endpoint().
        verbose: When True, also dump the captured response body.
    """
    # NOTE(review): the HEALTHY/UNHEALTHY emoji values appear blank in this
    # source — confirm the intended glyphs survived the file's encoding.
    status_emoji = {
        "HEALTHY": "",
        "DEGRADED": "⚠️",
        "UNHEALTHY": ""
    }
    print("\n" + "="*60)
    emoji = status_emoji.get(result["overall_status"], "")
    print(f"{emoji} {result['overall_status']}: {result['url']}")
    print("="*60)
    # status_code is None when the request itself failed — skip metrics then.
    if result.get("status_code"):
        print(f"\n📊 Status Code: {result['status_code']}")
        print(f"⏱️ Response Time: {result['response_time']:.3f}s")
    # Print checks
    if result["checks"]:
        print(f"\n✅ Passed Checks:")
        for check in result["checks"]:
            print(f" {check}")
    # Print warnings
    if result["warnings"]:
        print(f"\n⚠️ Warnings:")
        for warning in result["warnings"]:
            print(f" {warning}")
    # Print errors
    if result["errors"]:
        print(f"\n❌ Errors:")
        for error in result["errors"]:
            print(f" {error}")
    # Print response data if verbose
    if verbose and "response_data" in result:
        print(f"\n📄 Response Data:")
        if isinstance(result["response_data"], dict):
            print(json.dumps(result["response_data"], indent=2))
        else:
            print(result["response_data"])
    print("="*60)
def print_summary(results: List[Dict[str, Any]]):
    """Print an aggregate summary for multiple endpoint validations.

    Args:
        results: List of dicts from HealthCheckValidator.validate_endpoint().
    """
    print("\n" + "="*60)
    print("📊 HEALTH CHECK VALIDATION SUMMARY")
    print("="*60)
    healthy = sum(1 for r in results if r["overall_status"] == "HEALTHY")
    degraded = sum(1 for r in results if r["overall_status"] == "DEGRADED")
    unhealthy = sum(1 for r in results if r["overall_status"] == "UNHEALTHY")
    print(f"\n✅ Healthy: {healthy}/{len(results)}")
    print(f"⚠️ Degraded: {degraded}/{len(results)}")
    print(f"❌ Unhealthy: {unhealthy}/{len(results)}")
    if results:
        # BUG FIX: average only over results that actually recorded a
        # response time.  The old code summed the timed subset but divided
        # by len(results), so unreachable endpoints (response_time=None)
        # dragged the reported mean down.
        timed = [r["response_time"] for r in results if r.get("response_time")]
        avg_response_time = sum(timed) / len(timed) if timed else 0
        print(f"\n⏱️ Average Response Time: {avg_response_time:.3f}s")
    print("="*60)
def main() -> None:
    """CLI entry point: validate one or more health-check URLs.

    Prints each endpoint's result, plus a roll-up summary when more than
    one URL was supplied.
    """
    parser = argparse.ArgumentParser(
        description="Validate health check endpoints",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Check a single endpoint
python3 health_check_validator.py https://api.example.com/health
# Check multiple endpoints
python3 health_check_validator.py \\
https://api.example.com/health \\
https://api.example.com/readiness
# Verbose output with response data
python3 health_check_validator.py https://api.example.com/health --verbose
# Custom timeout
python3 health_check_validator.py https://api.example.com/health --timeout 10
Best Practices Checked:
✓ Returns 200 status code
✓ Response time < 1 second
✓ Returns JSON format
✓ Contains 'status' field
✓ Includes version/build info
✓ Checks dependencies
✓ Includes metrics
✓ Disables caching
"""
    )
    parser.add_argument('urls', nargs='+', help='Health check endpoint URL(s)')
    parser.add_argument('--timeout', type=int, default=5, help='Request timeout in seconds (default: 5)')
    parser.add_argument('--verbose', action='store_true', help='Show detailed response data')
    args = parser.parse_args()
    validator = HealthCheckValidator(timeout=args.timeout)
    results = validator.validate_multiple(args.urls)
    # Print individual results
    for result in results:
        print_result(result, args.verbose)
    # Print summary if multiple endpoints
    if len(results) > 1:
        print_summary(results)

321
scripts/log_analyzer.py Normal file
View File

@@ -0,0 +1,321 @@
#!/usr/bin/env python3
"""
Parse and analyze logs for patterns, errors, and anomalies.
Supports: error detection, frequency analysis, pattern matching.
"""
import argparse
import sys
import re
import json
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
try:
from tabulate import tabulate
except ImportError:
tabulate = None
class LogAnalyzer:
    """Parses a log file and extracts log levels, error patterns,
    timestamps, error lines, and stack traces via regex heuristics."""

    # Common log level patterns.  analyze_log_levels() applies them in
    # this dict's order and stops at the first match per line.
    LOG_LEVELS = {
        'ERROR': r'\b(ERROR|Error|error)\b',
        'WARN': r'\b(WARN|Warning|warn|warning)\b',
        'INFO': r'\b(INFO|Info|info)\b',
        'DEBUG': r'\b(DEBUG|Debug|debug)\b',
        'FATAL': r'\b(FATAL|Fatal|fatal|CRITICAL|Critical)\b'
    }
    # Common error patterns (matched case-insensitively, independently).
    ERROR_PATTERNS = {
        'exception': r'Exception|exception|EXCEPTION',
        'stack_trace': r'\s+at\s+.*\(.*:\d+\)',
        'http_error': r'\b[45]\d{2}\b',  # 4xx and 5xx HTTP codes
        'timeout': r'timeout|timed out|TIMEOUT',
        'connection_refused': r'connection refused|ECONNREFUSED',
        'out_of_memory': r'OutOfMemoryError|OOM|out of memory',
        'null_pointer': r'NullPointerException|null pointer|NPE',
        'database_error': r'SQLException|database error|DB error'
    }

    def __init__(self, log_file: str):
        # Path to the log file; contents loaded by parse_file().
        self.log_file = log_file
        self.lines: List[str] = []           # raw lines (with newlines)
        self.log_levels: Counter = Counter()        # level name -> line count
        self.error_patterns: Counter = Counter()    # pattern name -> match count
        self.timestamps: List[str] = []             # raw timestamp strings, file order

    def parse_file(self) -> bool:
        """Read the whole log file into memory. Returns True on success."""
        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            with open(self.log_file, 'r', encoding='utf-8', errors='ignore') as f:
                self.lines = f.readlines()
            return True
        except Exception as e:
            print(f"❌ Error reading file: {e}")
            return False

    def analyze_log_levels(self):
        """Count lines per log level into self.log_levels.

        Each line is attributed to at most one level: the first pattern in
        LOG_LEVELS that matches.
        """
        for line in self.lines:
            for level, pattern in self.LOG_LEVELS.items():
                if re.search(pattern, line):
                    self.log_levels[level] += 1
                    break  # Count each line only once

    def analyze_error_patterns(self):
        """Count occurrences of each ERROR_PATTERNS entry into
        self.error_patterns.  A single line may match several patterns."""
        for line in self.lines:
            for pattern_name, pattern in self.ERROR_PATTERNS.items():
                if re.search(pattern, line, re.IGNORECASE):
                    self.error_patterns[pattern_name] += 1

    def extract_timestamps(self, timestamp_pattern: Optional[str] = None):
        """Collect the first timestamp match per line into self.timestamps.

        Args:
            timestamp_pattern: Optional custom regex; otherwise ISO,
                Apache, and syslog formats are tried in that order.
        """
        if not timestamp_pattern:
            # Common timestamp patterns
            patterns = [
                r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}',  # ISO format
                r'\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}',  # Apache format
                r'\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}',  # Syslog format
            ]
        else:
            patterns = [timestamp_pattern]
        for line in self.lines:
            for pattern in patterns:
                match = re.search(pattern, line)
                if match:
                    self.timestamps.append(match.group())
                    break

    def find_error_lines(self, context: int = 2) -> List[Dict[str, Any]]:
        """Return ERROR/FATAL lines, each with `context` surrounding lines.

        Returns:
            Dicts with 1-indexed 'line_number', stripped 'line', and the
            raw 'context' text (original newlines preserved).
        """
        errors = []
        for i, line in enumerate(self.lines):
            # Check if line contains error keywords
            is_error = any(re.search(pattern, line, re.IGNORECASE)
                           for pattern in [self.LOG_LEVELS['ERROR'], self.LOG_LEVELS['FATAL']])
            if is_error:
                # Get context lines (clamped to the file's bounds)
                start = max(0, i - context)
                end = min(len(self.lines), i + context + 1)
                context_lines = self.lines[start:end]
                errors.append({
                    'line_number': i + 1,  # 1-indexed for display
                    'line': line.strip(),
                    'context': ''.join(context_lines)
                })
        return errors

    def analyze_frequency(self, time_window_minutes: int = 5) -> Dict[str, Any]:
        """Rough log-rate estimate per time window.

        NOTE(review): as the inline comment says, this treats each
        extracted timestamp as one unit of time rather than parsing real
        times — a deliberate simplification; verify before relying on it.
        """
        if not self.timestamps:
            return {"error": "No timestamps found"}
        # This is a simplified version - in production you'd parse actual timestamps
        total_lines = len(self.lines)
        if self.timestamps:
            time_span = len(self.timestamps)
            avg_per_window = total_lines / max(1, time_span / time_window_minutes)
        else:
            avg_per_window = 0
        return {
            "total_lines": total_lines,
            "timestamps_found": len(self.timestamps),
            "avg_per_window": avg_per_window
        }

    def extract_unique_messages(self, pattern: str) -> List[str]:
        """Return unique matches of *pattern*, in first-seen order.

        If the pattern has capture groups, group 1 is used; otherwise the
        whole match.  Matching is case-insensitive.
        """
        matches = []
        seen = set()
        for line in self.lines:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                msg = match.group() if match.lastindex is None else match.group(1)
                if msg not in seen:
                    matches.append(msg)
                    seen.add(msg)
        return matches

    def find_stack_traces(self) -> List[Dict[str, Any]]:
        """Extract stack traces (exception header + '  at ...' frames,
        e.g. Java-style).

        Returns:
            Dicts with 1-indexed 'line_start' and the newline-joined,
            stripped 'trace' text.
        """
        stack_traces = []
        current_trace = []
        in_trace = False
        for i, line in enumerate(self.lines):
            # Start of stack trace
            if re.search(r'Exception|Error.*:', line):
                if current_trace:
                    # A new header closes the previous trace first.
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = [line.strip()]
                in_trace = True
            # Stack trace continuation
            elif in_trace and re.search(r'^\s+at\s+', line):
                current_trace.append(line.strip())
            # End of stack trace
            elif in_trace:
                if current_trace:
                    stack_traces.append({
                        'line_start': i - len(current_trace) + 1,
                        'trace': '\n'.join(current_trace)
                    })
                current_trace = []
                in_trace = False
        # Add last trace if exists (file ended mid-trace)
        if current_trace:
            stack_traces.append({
                'line_start': len(self.lines) - len(current_trace) + 1,
                'trace': '\n'.join(current_trace)
            })
        return stack_traces
def print_analysis_results(analyzer: LogAnalyzer, show_errors: bool = False,
                           show_traces: bool = False):
    """Print the full analysis report for a parsed LogAnalyzer.

    Args:
        analyzer: LogAnalyzer whose analyze_* methods have already run.
        show_errors: Also list the first 10 ERROR/FATAL lines.
        show_traces: Also dump up to 5 extracted stack traces.
    """
    print("\n" + "="*60)
    print("📝 LOG ANALYSIS RESULTS")
    print("="*60)
    print(f"\n📁 File: {analyzer.log_file}")
    print(f"📊 Total Lines: {len(analyzer.lines):,}")
    # Log levels
    if analyzer.log_levels:
        print(f"\n{'='*60}")
        print("📊 LOG LEVEL DISTRIBUTION:")
        print(f"{'='*60}")
        # NOTE(review): the ERROR/INFO emoji values appear blank in this
        # source — confirm the intended glyphs survived the encoding.
        level_emoji = {
            'FATAL': '🔴',
            'ERROR': '',
            'WARN': '⚠️',
            'INFO': '',
            'DEBUG': '🐛'
        }
        for level, count in analyzer.log_levels.most_common():
            emoji = level_emoji.get(level, '')
            # Share of ALL lines, not just leveled ones.
            percentage = (count / len(analyzer.lines)) * 100
            print(f"{emoji} {level:10s}: {count:6,} ({percentage:5.1f}%)")
    # Error patterns (top 10 by count)
    if analyzer.error_patterns:
        print(f"\n{'='*60}")
        print("🔍 ERROR PATTERNS DETECTED:")
        print(f"{'='*60}")
        for pattern, count in analyzer.error_patterns.most_common(10):
            print(f"{pattern:20s}: {count:,} occurrences")
    # Timestamps
    if analyzer.timestamps:
        print(f"\n{'='*60}")
        print(f"⏰ Timestamps Found: {len(analyzer.timestamps):,}")
        print(f" First: {analyzer.timestamps[0]}")
        print(f" Last: {analyzer.timestamps[-1]}")
    # Error lines (first 10, with one line of context each)
    if show_errors:
        errors = analyzer.find_error_lines(context=1)
        if errors:
            print(f"\n{'='*60}")
            print(f"❌ ERROR LINES (showing first 10 of {len(errors)}):")
            print(f"{'='*60}")
            for error in errors[:10]:
                print(f"\nLine {error['line_number']}:")
                print(f" {error['line']}")
    # Stack traces (first 5)
    if show_traces:
        traces = analyzer.find_stack_traces()
        if traces:
            print(f"\n{'='*60}")
            print(f"📚 STACK TRACES FOUND: {len(traces)}")
            print(f"{'='*60}")
            for i, trace in enumerate(traces[:5], 1):
                print(f"\nTrace {i} (starting at line {trace['line_start']}):")
                print(trace['trace'])
                # Separator between traces (not after the last one shown)
                if i < len(traces):
                    print("\n" + "-"*60)
    print("\n" + "="*60)
def main() -> None:
    """CLI entry point: parse a log file and print the analysis report.

    Exits with status 1 when the file is missing or unreadable.
    """
    parser = argparse.ArgumentParser(
        description="Analyze log files for errors, patterns, and anomalies",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic analysis
python3 log_analyzer.py application.log
# Show error lines with context
python3 log_analyzer.py application.log --show-errors
# Show stack traces
python3 log_analyzer.py application.log --show-traces
# Full analysis
python3 log_analyzer.py application.log --show-errors --show-traces
Features:
• Log level distribution (ERROR, WARN, INFO, DEBUG, FATAL)
• Common error pattern detection
• Timestamp extraction
• Error line identification with context
• Stack trace extraction
• Frequency analysis
"""
    )
    parser.add_argument('log_file', help='Path to log file')
    parser.add_argument('--show-errors', action='store_true', help='Show error lines')
    parser.add_argument('--show-traces', action='store_true', help='Show stack traces')
    parser.add_argument('--timestamp-pattern', help='Custom regex for timestamp extraction')
    args = parser.parse_args()
    # Fail fast on a missing input file.
    if not Path(args.log_file).exists():
        print(f"❌ File not found: {args.log_file}")
        sys.exit(1)
    print(f"🔍 Analyzing log file: {args.log_file}")
    analyzer = LogAnalyzer(args.log_file)
    if not analyzer.parse_file():
        sys.exit(1)
    # Perform analysis
    analyzer.analyze_log_levels()
    analyzer.analyze_error_patterns()
    analyzer.extract_timestamps(args.timestamp_pattern)
    # Print results
    print_analysis_results(analyzer, args.show_errors, args.show_traces)

365
scripts/slo_calculator.py Normal file
View File

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
Calculate SLO compliance, error budgets, and burn rates.
Supports availability SLOs and latency SLOs.
"""
import argparse
import sys
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
try:
from tabulate import tabulate
except ImportError:
print("⚠️ Warning: 'tabulate' library not found. Install with: pip install tabulate")
tabulate = None
class SLOCalculator:
    """Computes SLO compliance, error budgets, and burn rates for a
    configurable target percentage and evaluation period."""

    # Allowed downtime (in days) per period for common SLO targets.
    SLO_TARGETS = {
        "90.0": {"year": 36.5, "month": 3.0, "week": 0.7, "day": 0.1},  # days
        "95.0": {"year": 18.25, "month": 1.5, "week": 0.35, "day": 0.05},
        "99.0": {"year": 3.65, "month": 0.3, "week": 0.07, "day": 0.01},
        "99.5": {"year": 1.83, "month": 0.15, "week": 0.035, "day": 0.005},
        "99.9": {"year": 0.365, "month": 0.03, "week": 0.007, "day": 0.001},
        "99.95": {"year": 0.183, "month": 0.015, "week": 0.0035, "day": 0.0005},
        "99.99": {"year": 0.0365, "month": 0.003, "week": 0.0007, "day": 0.0001},
    }

    def __init__(self, slo_target: float, period_days: int = 30):
        """
        Initialize SLO calculator.

        Args:
            slo_target: SLO target percentage (e.g., 99.9)
            period_days: Time period in days (default: 30)
        """
        self.slo_target = slo_target
        self.period_days = period_days
        # Pre-computed minutes of allowed downtime for the whole period.
        self.error_budget_minutes = self.calculate_error_budget_minutes()

    def calculate_error_budget_minutes(self) -> float:
        """Return the error budget for the period, in minutes."""
        minutes_in_period = self.period_days * 24 * 60
        return minutes_in_period * ((100 - self.slo_target) / 100)

    def calculate_availability_slo(self, total_requests: int, failed_requests: int) -> Dict[str, Any]:
        """
        Calculate availability SLO compliance.

        Args:
            total_requests: Total number of requests
            failed_requests: Number of failed requests

        Returns:
            Dict with SLO compliance metrics, or an 'error' entry (with
            slo_met=False) when there were no requests at all.
        """
        if total_requests == 0:
            return {"error": "No requests in the period", "slo_met": False}

        successes = total_requests - failed_requests
        success_rate = (successes / total_requests) * 100
        error_rate = (failed_requests / total_requests) * 100

        # Budget: how many failures the SLO permits over this many requests.
        allowed_failures = total_requests * ((100 - self.slo_target) / 100)
        if allowed_failures > 0:
            budget_consumed = (failed_requests / allowed_failures) * 100
        else:
            budget_consumed = float('inf')

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "failed_requests": failed_requests,
            "success_requests": successes,
            "success_rate": success_rate,
            "error_rate": error_rate,
            "slo_met": success_rate >= self.slo_target,
            "error_budget_total": allowed_failures,
            "error_budget_consumed": budget_consumed,
            "error_budget_remaining": max(0, 100 - budget_consumed),
            "margin": success_rate - self.slo_target,
        }

    def calculate_latency_slo(self, total_requests: int, requests_exceeding_threshold: int) -> Dict[str, Any]:
        """
        Calculate latency SLO compliance.

        Args:
            total_requests: Total number of requests
            requests_exceeding_threshold: Number of requests exceeding latency threshold

        Returns:
            Dict with SLO compliance metrics, or an 'error' entry (with
            slo_met=False) when there were no requests at all.
        """
        if total_requests == 0:
            return {"error": "No requests in the period", "slo_met": False}

        fast_requests = total_requests - requests_exceeding_threshold
        within_threshold_rate = (fast_requests / total_requests) * 100

        # Budget: how many slow requests the SLO permits.
        allowed_slow_requests = total_requests * ((100 - self.slo_target) / 100)
        if allowed_slow_requests > 0:
            budget_consumed = (requests_exceeding_threshold / allowed_slow_requests) * 100
        else:
            budget_consumed = float('inf')

        return {
            "slo_target": self.slo_target,
            "period_days": self.period_days,
            "total_requests": total_requests,
            "requests_exceeding_threshold": requests_exceeding_threshold,
            "requests_within_threshold": fast_requests,
            "within_threshold_rate": within_threshold_rate,
            "slo_met": within_threshold_rate >= self.slo_target,
            "error_budget_total": allowed_slow_requests,
            "error_budget_consumed": budget_consumed,
            "error_budget_remaining": max(0, 100 - budget_consumed),
            "margin": within_threshold_rate - self.slo_target,
        }

    def calculate_burn_rate(self, errors_in_window: int, requests_in_window: int, window_hours: float) -> Dict[str, Any]:
        """
        Calculate error budget burn rate.

        Args:
            errors_in_window: Number of errors in the time window
            requests_in_window: Total requests in the time window
            window_hours: Size of the time window in hours

        Returns:
            Dict with burn-rate metrics, or {'error': ...} when the window
            had no requests.  burn_rate == 1 means the budget lasts exactly
            the configured period.
        """
        if requests_in_window == 0:
            return {"error": "No requests in window"}

        actual_error_rate = (errors_in_window / requests_in_window) * 100
        allowed_error_rate = 100 - self.slo_target
        if allowed_error_rate > 0:
            burn_rate = actual_error_rate / allowed_error_rate
        else:
            burn_rate = float('inf')

        # Time until the whole period's budget is gone at this rate.
        if burn_rate > 0:
            hours_to_exhaustion = (self.error_budget_minutes / 60) / burn_rate
        else:
            hours_to_exhaustion = float('inf')

        # Severity thresholds (14.4x / 6x match common multi-window
        # burn-rate alerting guidance, e.g. the Google SRE workbook).
        if burn_rate >= 14.4:  # 1 hour window, burns budget in 2 days
            severity = "critical"
        elif burn_rate >= 6:  # 6 hour window, burns budget in 5 days
            severity = "warning"
        elif burn_rate >= 1:
            severity = "elevated"
        else:
            severity = "normal"

        return {
            "window_hours": window_hours,
            "requests_in_window": requests_in_window,
            "errors_in_window": errors_in_window,
            "actual_error_rate": actual_error_rate,
            "allowed_error_rate": allowed_error_rate,
            "burn_rate": burn_rate,
            "hours_to_exhaustion": hours_to_exhaustion,
            "severity": severity,
        }

    @staticmethod
    def print_slo_table():
        """Print a table of common SLO targets and their allowed downtime."""
        if not tabulate:
            print("Install tabulate for formatted output: pip install tabulate")
            return
        print("\n📊 SLO TARGETS AND ALLOWED DOWNTIME")
        print("="*60)
        headers = ["SLO", "Year", "Month", "Week", "Day"]
        rows = [
            [f"{slo}%"] + [f"{downtimes[period]:.2f} days"
                           for period in ("year", "month", "week", "day")]
            for slo, downtimes in sorted(SLOCalculator.SLO_TARGETS.items(), reverse=True)
        ]
        print(tabulate(rows, headers=headers, tablefmt="grid"))
def print_availability_results(results: Dict[str, Any]):
    """Write a human-readable availability-SLO compliance report to stdout.

    Args:
        results: Metrics dict from the availability (or latency) SLO
            calculation; may instead be {"error": msg}, in which case only
            the error line is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("📊 AVAILABILITY SLO COMPLIANCE")
    print(banner)
    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return
    slo_met = results['slo_met']
    # NOTE(review): status markers are empty strings in the original source
    # (possibly stripped emoji); reproduced verbatim.
    status_emoji = "" if slo_met else ""
    status_text = "MET" if slo_met else "VIOLATED"
    print(f"\n{status_emoji} SLO Status: {status_text}")
    print(f" Target: {results['slo_target']}%")
    print(f" Actual: {results['success_rate']:.3f}%")
    print(f" Margin: {results['margin']:+.3f}%")
    print("\n📈 Request Statistics:")
    print(f" Total Requests: {results['total_requests']:,}")
    print(f" Successful: {results['success_requests']:,}")
    print(f" Failed: {results['failed_requests']:,}")
    print(f" Error Rate: {results['error_rate']:.3f}%")
    print("\n💰 Error Budget:")
    remaining = results['error_budget_remaining']
    # Marker tiers: healthy (>20%), warning (>0%), exhausted.
    if remaining > 20:
        budget_emoji = ""
    elif remaining > 0:
        budget_emoji = "⚠️"
    else:
        budget_emoji = ""
    print(f" {budget_emoji} Remaining: {remaining:.1f}%")
    print(f" Consumed: {results['error_budget_consumed']:.1f}%")
    print(f" Allowed Failures: {results['error_budget_total']:.0f}")
    print("\n" + banner)
def print_burn_rate_results(results: Dict[str, Any]):
    """Write a human-readable error-budget burn-rate report to stdout.

    Args:
        results: Metrics dict from the burn-rate calculation; may instead be
            {"error": msg}, in which case only the error line is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🔥 ERROR BUDGET BURN RATE")
    print(banner)
    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return
    # Traffic-light marker per severity tier; unknown tiers get no marker.
    emoji_by_severity = {
        "critical": "🔴",
        "warning": "🟡",
        "elevated": "🟠",
        "normal": "🟢"
    }
    severity = results['severity']
    marker = emoji_by_severity.get(severity, '')
    print(f"\n{marker} Severity: {severity.upper()}")
    print(f" Burn Rate: {results['burn_rate']:.2f}x")
    hours_left = results['hours_to_exhaustion']
    print(f" Time to Exhaustion: {hours_left:.1f} hours ({hours_left/24:.1f} days)")
    print("\n📊 Window Statistics:")
    print(f" Window: {results['window_hours']} hours")
    print(f" Requests: {results['requests_in_window']:,}")
    print(f" Errors: {results['errors_in_window']:,}")
    print(f" Actual Error Rate: {results['actual_error_rate']:.3f}%")
    print(f" Allowed Error Rate: {results['allowed_error_rate']:.3f}%")
    print("\n" + banner)
def main():
    """CLI entry point: parse arguments and dispatch to the requested calculation.

    Modes:
        availability — compliance against an availability SLO
        latency      — compliance against a latency SLO (same report format)
        burn-rate    — error-budget burn rate over a time window

    Exits with status 1 when required arguments for the chosen mode are missing.
    """
    parser = argparse.ArgumentParser(
        description="Calculate SLO compliance and error budgets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Show SLO reference table
python3 slo_calculator.py --table
# Calculate availability SLO
python3 slo_calculator.py availability \\
--slo 99.9 \\
--total-requests 1000000 \\
--failed-requests 1500 \\
--period-days 30
# Calculate latency SLO
python3 slo_calculator.py latency \\
--slo 99.5 \\
--total-requests 500000 \\
--slow-requests 3000 \\
--period-days 7
# Calculate burn rate
python3 slo_calculator.py burn-rate \\
--slo 99.9 \\
--errors 50 \\
--requests 10000 \\
--window-hours 1
"""
    )
    parser.add_argument('mode', nargs='?', choices=['availability', 'latency', 'burn-rate'],
                        help='Calculation mode')
    parser.add_argument('--table', action='store_true', help='Show SLO reference table')
    parser.add_argument('--slo', type=float, help='SLO target percentage (e.g., 99.9)')
    parser.add_argument('--period-days', type=int, default=30, help='Period in days (default: 30)')
    # Availability SLO arguments
    parser.add_argument('--total-requests', type=int, help='Total number of requests')
    parser.add_argument('--failed-requests', type=int, help='Number of failed requests')
    # Latency SLO arguments
    parser.add_argument('--slow-requests', type=int, help='Number of requests exceeding threshold')
    # Burn rate arguments
    parser.add_argument('--errors', type=int, help='Number of errors in window')
    parser.add_argument('--requests', type=int, help='Number of requests in window')
    parser.add_argument('--window-hours', type=float, help='Window size in hours')
    args = parser.parse_args()
    # Reference-table mode needs no further arguments.
    if args.table:
        SLOCalculator.print_slo_table()
        return
    if not args.mode:
        parser.print_help()
        return
    # BUG FIX: compare against None rather than truthiness, so explicit zero
    # values (e.g. --requests 0) are not misreported as missing arguments;
    # the calculator itself handles zero-request windows gracefully.
    if args.slo is None:
        print("❌ --slo required")
        sys.exit(1)
    calculator = SLOCalculator(args.slo, args.period_days)
    if args.mode == 'availability':
        if args.total_requests is None or args.failed_requests is None:
            print("❌ --total-requests and --failed-requests required")
            sys.exit(1)
        results = calculator.calculate_availability_slo(args.total_requests, args.failed_requests)
        print_availability_results(results)
    elif args.mode == 'latency':
        if args.total_requests is None or args.slow_requests is None:
            print("❌ --total-requests and --slow-requests required")
            sys.exit(1)
        results = calculator.calculate_latency_slo(args.total_requests, args.slow_requests)
        # Latency results share the availability report's keys and format.
        print_availability_results(results)
    elif args.mode == 'burn-rate':
        if args.errors is None or args.requests is None or args.window_hours is None:
            print("❌ --errors, --requests, and --window-hours required")
            sys.exit(1)
        results = calculator.calculate_burn_rate(args.errors, args.requests, args.window_hours)
        print_burn_rate_results(results)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()