gh-ahmedasmar-devops-claude…/scripts/analyze_metrics.py

#!/usr/bin/env python3
"""
Analyze metrics from Prometheus or CloudWatch and detect anomalies.
Supports: rate of change analysis, spike detection, trend analysis.
"""

import argparse
import json
import math
import statistics
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional

try:
    import requests
except ImportError:
    print("⚠️  Warning: 'requests' library not found. Install with: pip install requests")
    sys.exit(1)

try:
    import boto3
except ImportError:
    boto3 = None


class MetricAnalyzer:
    """Fetch metric series from Prometheus or CloudWatch and analyze them.

    The analysis helpers (``detect_anomalies`` and ``analyze_trend``) work on
    plain lists of floats and do not depend on the configured source.
    """

    def __init__(self, source: str, endpoint: Optional[str] = None, region: str = "us-east-1"):
        """Store the configuration and create a CloudWatch client if needed.

        Args:
            source: Metric backend; a CloudWatch client is created only when
                this is ``"cloudwatch"``.
            endpoint: Base URL of the Prometheus server (used by
                ``query_prometheus``).
            region: AWS region for the CloudWatch client.

        Exits the process with status 1 when ``source`` is ``"cloudwatch"``
        but boto3 could not be imported.
        """
        self.source = source
        self.endpoint = endpoint
        self.region = region
        if source == "cloudwatch":
            if not boto3:
                print("⚠️  boto3 not installed. Install with: pip install boto3")
                sys.exit(1)
            self.cloudwatch = boto3.client('cloudwatch', region_name=region)

    def query_prometheus(self, query: str, hours: int = 24) -> List[Dict]:
        """Run a PromQL range query covering the last ``hours`` hours.

        Args:
            query: PromQL expression to evaluate.
            hours: Size of the look-back window, ending now.

        Returns:
            The ``data.result`` list of the Prometheus response, or an empty
            list when the request fails or Prometheus reports a non-success
            status (the error is printed).

        Exits the process with status 1 when no endpoint was configured.
        """
        if not self.endpoint:
            print("❌ Prometheus endpoint required")
            sys.exit(1)

        try:
            # Aware UTC timestamps: unambiguous and consistent with the
            # CloudWatch query path.
            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours)

            params = {
                'query': query,
                'start': start_time.timestamp(),
                'end': end_time.timestamp(),
                'step': '5m'  # 5-minute resolution
            }

            # Tolerate a trailing slash on the endpoint so the path does not
            # become "//api/v1/query_range".
            url = f"{self.endpoint.rstrip('/')}/api/v1/query_range"
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if data['status'] != 'success':
                print(f"❌ Prometheus query failed: {data}")
                return []

            return data['data']['result']

        except Exception as e:
            # Deliberate best-effort boundary: report the failure and let the
            # caller treat it as "no data".
            print(f"❌ Error querying Prometheus: {e}")
            return []

    def query_cloudwatch(self, namespace: str, metric_name: str, dimensions: Dict[str, str],
                         hours: int = 24, stat: str = "Average") -> List[Dict]:
        """Fetch CloudWatch datapoints for the last ``hours`` hours.

        Args:
            namespace: CloudWatch namespace of the metric.
            metric_name: Name of the metric.
            dimensions: Dimension name -> value mapping identifying the series.
            hours: Size of the look-back window, ending now.
            stat: Statistic to request; datapoints carry it under this key.

        Returns:
            The response datapoints sorted by ``Timestamp``, or an empty list
            when the call fails (the error is printed).
        """
        try:
            # Use timezone-aware UTC datetimes: botocore interprets naive
            # datetimes as UTC, so a naive local ``datetime.now()`` would
            # shift the window by the local UTC offset.
            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours)

            dimensions_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]

            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                Dimensions=dimensions_list,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,  # 5-minute intervals
                Statistics=[stat]
            )

            # The datapoints are sorted here so the analysis sees them in
            # chronological order.
            return sorted(response['Datapoints'], key=lambda x: x['Timestamp'])

        except Exception as e:
            # Deliberate best-effort boundary: report the failure and let the
            # caller treat it as "no data".
            print(f"❌ Error querying CloudWatch: {e}")
            return []

    def detect_anomalies(self, values: List[float], sensitivity: float = 2.0) -> Dict[str, Any]:
        """Flag values further than ``sensitivity`` standard deviations from the mean.

        Args:
            values: Series to inspect.
            sensitivity: Width of the accepted band, in sample standard
                deviations on each side of the mean.

        Returns:
            With fewer than 10 values: ``anomalies_detected`` (False) and a
            ``message``. Otherwise: ``anomalies_detected``, ``count``,
            ``anomalies`` (each with ``index``, ``value`` and ``deviation`` in
            standard deviations) and ``stats``.
        """
        if len(values) < 10:
            return {
                "anomalies_detected": False,
                "message": "Insufficient data points for anomaly detection"
            }

        mean = statistics.mean(values)
        stdev = statistics.stdev(values)
        threshold_upper = mean + (sensitivity * stdev)
        threshold_lower = mean - (sensitivity * stdev)

        anomalies = [
            {
                "index": i,
                "value": value,
                # Guard the division; a zero stdev means all values are equal.
                "deviation": abs(value - mean) / stdev if stdev > 0 else 0
            }
            for i, value in enumerate(values)
            if value > threshold_upper or value < threshold_lower
        ]

        return {
            "anomalies_detected": len(anomalies) > 0,
            "count": len(anomalies),
            "anomalies": anomalies,
            "stats": {
                "mean": mean,
                "stdev": stdev,
                "threshold_upper": threshold_upper,
                "threshold_lower": threshold_lower,
                "total_points": len(values)
            }
        }

    def analyze_trend(self, values: List[float]) -> Dict[str, Any]:
        """Classify the series trend from a least-squares line over the index.

        Args:
            values: Series to inspect; points are treated as evenly spaced.

        Returns:
            With fewer than 2 values: ``trend`` "unknown" and a ``message``.
            Otherwise ``trend`` ("stable", "increasing" or "decreasing"),
            ``slope`` (change per interval) and ``rate_of_change`` (slope as a
            percentage of the mean's magnitude, carrying the slope's sign;
            0 when the mean is 0).
        """
        if len(values) < 2:
            return {"trend": "unknown", "message": "Insufficient data"}

        n = len(values)
        x_mean = (n - 1) / 2  # mean of 0..n-1
        y_mean = sum(values) / n

        numerator = sum((i - x_mean) * (y - y_mean) for i, y in enumerate(values))
        denominator = sum((i - x_mean) ** 2 for i in range(n))

        if denominator == 0:
            return {"trend": "flat", "slope": 0}

        slope = numerator / denominator

        # Compare against the magnitude of the mean: with the signed mean the
        # "stable" branch was unreachable for zero or negative means, so a
        # perfectly flat all-zero series was reported as "decreasing".
        mean_magnitude = abs(y_mean)
        if slope == 0 or abs(slope) < 0.01 * mean_magnitude:  # Less than 1% change per interval
            trend = "stable"
        elif slope > 0:
            trend = "increasing"
        else:
            trend = "decreasing"

        return {
            "trend": trend,
            "slope": slope,
            # Divide by the magnitude so the sign always matches the slope.
            "rate_of_change": (slope / mean_magnitude * 100) if y_mean != 0 else 0
        }


def print_results(results: Dict[str, Any]):
    """Pretty print analysis results to stdout.

    Args:
        results: Analysis summary. Recognized keys:
            ``error`` - if present, only the error is printed;
            ``data_points`` - number of analyzed samples (defaults to 0);
            ``trend`` - dict with ``trend`` and optionally ``rate_of_change``
            or ``message``;
            ``anomalies`` - dict with ``anomalies_detected`` and, when
            anomalies were found, ``count``, ``stats`` and ``anomalies``;
            a ``message`` explains why detection was skipped.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("📊 METRIC ANALYSIS RESULTS")
    print(banner)

    if "error" in results:
        print(f"\n❌ Error: {results['error']}")
        return

    print(f"\n📈 Data Points: {results.get('data_points', 0)}")

    # Trend analysis
    if "trend" in results:
        trend_info = results["trend"]
        trend_emoji = {"increasing": "📈", "decreasing": "📉", "stable": "➡️"}.get(trend_info["trend"], "❓")
        print(f"\n{trend_emoji} Trend: {trend_info['trend'].upper()}")
        if "rate_of_change" in trend_info:
            print(f"   Rate of Change: {trend_info['rate_of_change']:.2f}% per interval")
        if "message" in trend_info:
            # Explain why no trend could be computed (e.g. too few points).
            print(f"   {trend_info['message']}")

    # Anomaly detection
    if "anomalies" in results:
        anomaly_data = results["anomalies"]
        if anomaly_data["anomalies_detected"]:
            stats = anomaly_data['stats']
            print(f"\n⚠️  ANOMALIES DETECTED: {anomaly_data['count']}")
            print(f"   Mean: {stats['mean']:.2f}")
            print(f"   Std Dev: {stats['stdev']:.2f}")
            print(f"   Threshold: [{stats['threshold_lower']:.2f}, {stats['threshold_upper']:.2f}]")

            print("\n   Top Anomalies:")
            # Show the five points that deviate most from the mean.
            worst_first = sorted(anomaly_data['anomalies'], key=lambda x: x['deviation'], reverse=True)
            for anomaly in worst_first[:5]:
                print(f"   • Index {anomaly['index']}: {anomaly['value']:.2f} ({anomaly['deviation']:.2f}σ)")
        elif "message" in anomaly_data:
            # Detection did not run; claiming "no anomalies" here would be
            # misleading, so surface the reason instead.
            print(f"\nℹ️  {anomaly_data['message']}")
        else:
            print("\n✅ No anomalies detected")

    print("\n" + banner)


def _parse_dimensions(spec: str) -> Dict[str, str]:
    """Parse a ``key=value,key2=value2`` string into a dimensions dict.

    Only the first ``=`` of each item separates the key from the value, so
    values may themselves contain ``=``. Surrounding whitespace and empty
    items (for example from a trailing comma) are ignored.

    Raises:
        ValueError: If an item has no ``=`` or an empty key, or if no
            dimension remains after parsing.
    """
    dims: Dict[str, str] = {}
    for raw_item in spec.split(','):
        item = raw_item.strip()
        if not item:
            continue
        key, sep, value = item.partition('=')
        key = key.strip()
        if not sep or not key:
            raise ValueError(f"{item!r} is not in key=value form")
        dims[key] = value.strip()
    if not dims:
        raise ValueError("no dimensions given")
    return dims


def main():
    """Command-line entry point: fetch a metric series and print its analysis.

    Parses the arguments, validates the source-specific options, queries the
    selected backend, then runs trend analysis and anomaly detection on the
    returned values. Exits with status 1 on invalid options or when the
    backend returns no data.
    """
    parser = argparse.ArgumentParser(
        description="Analyze metrics from Prometheus or CloudWatch",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Prometheus: Analyze request rate
  python3 analyze_metrics.py prometheus \\
    --endpoint http://localhost:9090 \\
    --query 'rate(http_requests_total[5m])' \\
    --hours 24

  # CloudWatch: Analyze CPU utilization
  python3 analyze_metrics.py cloudwatch \\
    --namespace AWS/EC2 \\
    --metric CPUUtilization \\
    --dimensions InstanceId=i-1234567890abcdef0 \\
    --hours 48
        """
    )

    parser.add_argument('source', choices=['prometheus', 'cloudwatch'],
                       help='Metric source')
    parser.add_argument('--endpoint', help='Prometheus endpoint URL')
    parser.add_argument('--query', help='PromQL query')
    parser.add_argument('--namespace', help='CloudWatch namespace')
    parser.add_argument('--metric', help='CloudWatch metric name')
    parser.add_argument('--dimensions', help='CloudWatch dimensions (key=value,key2=value2)')
    parser.add_argument('--hours', type=int, default=24, help='Hours of data to analyze (default: 24)')
    parser.add_argument('--sensitivity', type=float, default=2.0,
                       help='Anomaly detection sensitivity (std deviations, default: 2.0)')
    parser.add_argument('--region', default='us-east-1', help='AWS region (default: us-east-1)')

    args = parser.parse_args()

    if args.hours <= 0:
        # A non-positive window makes the start time not precede the end time.
        parser.error("--hours must be a positive integer")

    # Validate source-specific options before constructing the analyzer so
    # bad input is reported without creating any client first.
    dims: Dict[str, str] = {}
    if args.source == 'prometheus':
        if not args.query:
            print("❌ --query required for Prometheus")
            sys.exit(1)
    else:
        if not all([args.namespace, args.metric, args.dimensions]):
            print("❌ --namespace, --metric, and --dimensions required for CloudWatch")
            sys.exit(1)
        try:
            dims = _parse_dimensions(args.dimensions)
        except ValueError as exc:
            print(f"❌ Invalid --dimensions: {exc}")
            sys.exit(1)

    analyzer = MetricAnalyzer(args.source, args.endpoint, args.region)

    # Query metrics
    if args.source == 'prometheus':
        print(f"🔍 Querying Prometheus: {args.query}")
        results = analyzer.query_prometheus(args.query, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        # Extract values from first result series; each sample is a
        # [timestamp, "value"] pair with the value encoded as a string.
        raw_values = [float(v[1]) for v in results[0].get('values', [])]
    else:
        print(f"🔍 Querying CloudWatch: {args.namespace}/{args.metric}")
        results = analyzer.query_cloudwatch(args.namespace, args.metric, dims, args.hours)

        if not results:
            print("❌ No data returned")
            sys.exit(1)

        raw_values = [point['Average'] for point in results]

    # NaN/Inf samples would turn the mean and standard deviation into NaN and
    # silently disable every comparison, so drop them up front.
    values = [v for v in raw_values if math.isfinite(v)]
    dropped = len(raw_values) - len(values)
    if dropped:
        print(f"⚠️  Ignored {dropped} non-finite sample(s) (NaN/Inf)")

    # Analyze metrics
    analysis_results = {
        "data_points": len(values),
        "trend": analyzer.analyze_trend(values),
        "anomalies": analyzer.detect_anomalies(values, args.sensitivity)
    }

    print_results(analysis_results)


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()