Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions

279
scripts/analyze_metrics.py Normal file
View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""
import argparse
import json
import statistics
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
try:
import requests
except ImportError:
print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
sys.exit(1)
try:
import boto3
except ImportError:
boto3 = None
class MetricAnalyzer:
    """Fetch metric time series from Prometheus or CloudWatch and run
    simple statistical analyses (std-dev anomaly detection, linear trend).
    """

    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        """
        Args:
            source: Metric backend, "prometheus" or "cloudwatch".
            endpoint: Prometheus base URL; required only for Prometheus queries.
            region: AWS region used for the CloudWatch client.

        Exits the process if CloudWatch is requested but boto3 is missing.
        """
        self.source = source
        self.endpoint = endpoint
        self.region = region
        if source == "cloudwatch" and boto3:
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        elif source == "cloudwatch" and not boto3:
            print("⚠️ boto3 not installed. Install with: pip install boto3")
            sys.exit(1)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Query the Prometheus range API for the last `hours` hours.

        Returns the raw result-series list from the response, or [] on
        query failure. Exits the process when no endpoint is configured.
        """
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)
        try:
            # Fix: aware UTC timestamps — .timestamp() is then unambiguous
            # regardless of the host's local timezone.
            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours)
            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }
            response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []
            return data['data']['result']
        except Exception as e:
            # Best-effort CLI: report and return no data instead of crashing.
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Fetch CloudWatch datapoints for the last `hours` hours at
        5-minute granularity, sorted chronologically. Returns [] on failure.
        """
        try:
            # Fix: use aware UTC datetimes. botocore serializes naive
            # datetimes as if they were already UTC, so a naive local time
            # would shift the query window on non-UTC hosts.
            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours)
            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]
            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )
            # CloudWatch returns datapoints unordered; sort by timestamp.
            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])
        except Exception as e:
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Flag points more than `sensitivity` standard deviations from the mean.

        Returns a dict with the flagged points (index/value/deviation in σ)
        and the summary statistics used for thresholding. Requires at least
        10 points; otherwise reports that detection was skipped.
        """
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }
        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)
        anomalies = []
        for i, value in enumerate(values):
            if value > threshold_upper or value < threshold_lower:
                anomalies.append({
                    "index": i,
                    "value": value,
                    # Deviation expressed in standard deviations (σ).
                    "deviation": abs(value - mean) / stdev if stdev > 0 else 0
                })
        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Classify the series trend via simple least-squares regression.

        Returns {"trend": "increasing"|"decreasing"|"stable"|"flat"|"unknown",
        "slope": ..., "rate_of_change": percent change per interval}.
        """
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}
        n = len(values)
        x = list(range(n))
        x_mean = sum(x) / n
        y_mean = sum(values) / n
        numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n))
        denominator = sum((x[i] - x_mean) ** 2 for i in range(n))
        if denominator == 0:
            return {"trend": "flat", "slope": 0}
        slope = numerator / denominator
        # Fix: compare against abs(y_mean). The original used 0.01 * y_mean,
        # which is negative for a negative-mean series, making "stable"
        # unreachable there.
        if abs(slope) < 0.01 * abs(y_mean):  # Less than 1% change per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"
        return {
            "trend": trend,
            "slope": slope,
            "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0
        }
def print_results(results: Dict[str, Any]):
    """Render an analysis-results dict as a human-readable stdout report."""
    banner = "=" * 60
    print("\n" + banner)
    print("📊 METRIC ANALYSIS RESULTS")
    print(banner)

    # Error reports short-circuit the rest of the output.
    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    print(f"\n📈 Data Points: {results.get('data_points', 0)}")

    # Trend section (only when trend analysis ran).
    if "trend" in results:
        trend_name = results["trend"]["trend"]
        emoji_by_trend = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}
        print(f"\n{emoji_by_trend.get(trend_name, '')} Trend: {trend_name.upper()}")
        if "rate_of_change" in results["trend"]:
            print(f" Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval")

    # Anomaly section (only when anomaly detection ran).
    if "anomalies" in results:
        report = results["anomalies"]
        if not report["anomalies_detected"]:
            print("\n✅ No anomalies detected")
        else:
            stats = report["stats"]
            print(f"\n⚠️ ANOMALIES DETECTED: {report['count']}")
            print(f" Mean: {stats['mean']:.2f}")
            print(f" Std Dev: {stats['stdev']:.2f}")
            print(f" Threshold: [{stats['threshold_lower']:.2f}, {stats['threshold_upper']:.2f}]")
            print("\n Top Anomalies:")
            worst_first = sorted(report['anomalies'], key=lambda a: a['deviation'], reverse=True)
            for entry in worst_first[:5]:
                print(f" • Index {entry['index']}: {entry['value']:.2f} ({entry['deviation']:.2f}σ)")

    print("\n" + banner)
def main():
    """CLI entry point: parse args, fetch the metric series, analyze, report."""
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Prometheus: Analyze request rate
  python3 analyze_metrics.py prometheus \\
    --endpoint http://localhost:9090 \\
    --query 'rate(http_requests_total[5m])' \\
    --hours 24

  # CloudWatch: Analyze CPU utilization
  python3 analyze_metrics.py cloudwatch \\
    --namespace AWS/EC2 \\
    --metric CPUUtilization \\
    --dimensions InstanceId=i-1234567890abcdef0 \\
    --hours 48
"""
    )
    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                        help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                        help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')
    # New, backward-compatible: choose which CloudWatch statistic to analyze.
    parser.add_argument('--stat', default='Average',
                        help='CloudWatch statistic to analyze (default: Average)')
    args = parser.parse_args()

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    # Fetch the raw series from the chosen backend.
    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)
        print(f"🔍 Querying Prometheus: {args.query}")
        results = analyzer.query_prometheus(args.query, args.hours)
        if not results:
            print("❌ No data returned")
            sys.exit(1)
        # Only the first result series is analyzed; each entry is a
        # [timestamp, "value"] pair, so index 1 holds the sample.
        values = [float(v[1]) for v in results[0].get('values', [])]
    elif args.source == 'cloudwatch':
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)
        # Fix: split on the first '=' only, so dimension values that
        # themselves contain '=' are parsed correctly.
        dims = dict(item.split('=', 1) for item in args.dimensions.split(','))
        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours, args.stat)
        if not results:
            print("❌ No data returned")
            sys.exit(1)
        # Fix: read the statistic that was actually requested instead of
        # hard-coding 'Average' (which would KeyError for other stats).
        values = [point[args.stat] for point in results]

    # Run the offline analyses and render the report.
    analysis_results = {
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity)
    }
    print_results(analysis_results)


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()