Initial commit
scripts/analyze_metrics.py (new file, 279 lines)
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""

import argparse
import sys
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import statistics

try:
    import requests
except ImportError:
    print("⚠️ Warning: 'requests' library not found. Install with: pip install requests")
    sys.exit(1)

try:
    import boto3
except ImportError:
    boto3 = None


class MetricAnalyzer:
    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        self.source = source
        self.endpoint = endpoint
        self.region = region
        if source == "cloudwatch" and boto3:
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        elif source == "cloudwatch" and not boto3:
            print("⚠️ boto3 not installed. Install with: pip install boto3")
            sys.exit(1)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Query Prometheus for metric data."""
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)

        try:
            # Query range for last N hours
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)

            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }

            response = requests.get(f"{self.endpoint}/api/v1/query_range", params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []

            return data['data']['result']

        except Exception as e:
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Query CloudWatch for metric data."""
        try:
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=hours)

            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]

            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )

            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])

        except Exception as e:
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Detect anomalies using standard deviation method."""
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }

        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)

        anomalies = []
        for i, value in enumerate(values):
            if value > threshold_upper or value < threshold_lower:
                anomalies.append({
                    "index": i,
                    "value": value,
                    "deviation": abs(value - mean) / stdev if stdev > 0 else 0
                })

        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Analyze trend using simple linear regression."""
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}

        n = len(values)
        x = list(range(n))
        x_mean = sum(x) / n
        y_mean = sum(values) / n

        numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n))
        denominator = sum((x[i] - x_mean) ** 2 for i in range(n))

        if denominator == 0:
            return {"trend": "flat", "slope": 0}

        slope = numerator / denominator

        # Determine trend direction
        if abs(slope) < 0.01 * abs(y_mean):  # Less than 1% of the mean per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"

        return {
            "trend": trend,
            "slope": slope,
            "rate_of_change": (slope / y_mean * 100) if y_mean != 0 else 0
        }


def print_results(results: Dict[str, Any]):
    """Pretty print analysis results."""
    print("\n" + "="*60)
    print("📊 METRIC ANALYSIS RESULTS")
    print("="*60)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    print(f"\n📈 Data Points: {results.get('data_points', 0)}")

    # Trend analysis
    if "trend" in results:
        trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(results["trend"]["trend"], "❓")
        print(f"\n{trend_emoji} Trend: {results['trend']['trend'].upper()}")
        if "rate_of_change" in results["trend"]:
            print(f"   Rate of Change: {results['trend']['rate_of_change']:.2f}% per interval")

    # Anomaly detection
    if "anomalies" in results:
        anomaly_data = results["anomalies"]
        if anomaly_data["anomalies_detected"]:
            print(f"\n⚠️ ANOMALIES DETECTED: {anomaly_data['count']}")
            print(f"   Mean: {anomaly_data['stats']['mean']:.2f}")
            print(f"   Std Dev: {anomaly_data['stats']['stdev']:.2f}")
            print(f"   Threshold: [{anomaly_data['stats']['threshold_lower']:.2f}, {anomaly_data['stats']['threshold_upper']:.2f}]")

            print("\n   Top Anomalies:")
            for anomaly in sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)[:5]:
                print(f"   • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)")
        else:
            print("\n✅ No anomalies detected")

    print("\n" + "="*60)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Prometheus: Analyze request rate
  python3 analyze_metrics.py prometheus \\
      --endpoint http://localhost:9090 \\
      --query 'rate(http_requests_total[5m])' \\
      --hours 24

  # CloudWatch: Analyze CPU utilization
  python3 analyze_metrics.py cloudwatch \\
      --namespace AWS/EC2 \\
      --metric CPUUtilization \\
      --dimensions InstanceId=i-1234567890abcdef0 \\
      --hours 48
"""
    )

    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                        help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                        help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')

    args = parser.parse_args()

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    # Query metrics
    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)

        print(f"🔍 Querying Prometheus: {args.query}")
        results = analyzer.query_prometheus(args.query, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        # Extract values from first result series
        values = [float(v[1]) for v in results[0].get('values', [])]

    elif args.source == 'cloudwatch':
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)

        # Split on the first '=' only, so dimension values may themselves contain '='
        dims = dict(item.split('=', 1) for item in args.dimensions.split(','))

        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        values = [point['Average'] for point in results]

    # Analyze metrics
    analysis_results = {
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity)
    }

    print_results(analysis_results)


if __name__ == "__main__":
    main()
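
For reference, a minimal sketch of driving the analyzer programmatically instead of through the CLI. This assumes scripts/ is importable so the file can be loaded as a module named analyze_metrics; the endpoint URL and PromQL query are the ones used in the epilog examples above, not additional requirements.

# Hypothetical usage sketch: assumes analyze_metrics is importable and a
# Prometheus server is reachable at the endpoint from the epilog example.
from analyze_metrics import MetricAnalyzer

analyzer = MetricAnalyzer("prometheus", endpoint="http://localhost:9090")
series = analyzer.query_prometheus("rate(http_requests_total[5m])", hours=24)
if series:
    values = [float(v[1]) for v in series[0].get("values", [])]
    print(analyzer.analyze_trend(values))               # trend, slope, rate_of_change
    print(analyzer.detect_anomalies(values, 2.0))       # flags points beyond mean ± 2σ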