Initial commit
This commit is contained in:
387
scripts/rightsizing_analyzer.py
Executable file
@@ -0,0 +1,387 @@
#!/usr/bin/env python3
"""
Analyze EC2 and RDS instances for rightsizing opportunities.

This script identifies:
- Oversized EC2 instances (low CPU utilization)
- Oversized RDS instances (low CPU/connection utilization)
- Recommended smaller instance types
- Potential cost savings

Usage:
    python3 rightsizing_analyzer.py [--region REGION] [--profile PROFILE] [--days DAYS]

Requirements:
    pip install boto3 tabulate
"""

import argparse
import boto3
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Optional
from tabulate import tabulate
import sys


class RightsizingAnalyzer:
    def __init__(self, profile: Optional[str] = None, region: Optional[str] = None, days: int = 14):
        self.session = boto3.Session(profile_name=profile) if profile else boto3.Session()
        self.regions = [region] if region else self._get_all_regions()
        self.days = days
        self.findings = {
            'ec2': [],
            'rds': []
        }
        self.total_savings = 0.0

        # CPU thresholds for rightsizing
        self.cpu_thresholds = {
            'underutilized': 15,  # < 15% avg CPU
            'low': 30,            # < 30% avg CPU
        }
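        # Example of how these feed the report: avg CPU of 12% is flagged "High",
        # 22% is flagged "Medium", and nothing is flagged at all unless max CPU
        # also stayed below 60% over the period.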

    def _get_all_regions(self) -> List[str]:
        """Get all enabled AWS regions."""
        ec2 = self.session.client('ec2', region_name='us-east-1')
        regions = ec2.describe_regions(AllRegions=False)
        return [region['RegionName'] for region in regions['Regions']]
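    # Note: describe_regions(AllRegions=False) returns only regions enabled for
    # the account, so never-enabled opt-in regions are skipped automatically.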

    def _estimate_hourly_cost(self, instance_type: str) -> float:
        """Rough estimate of hourly cost."""
        cost_map = {
            't3.micro': 0.0104, 't3.small': 0.0208, 't3.medium': 0.0416,
            't3.large': 0.0832, 't3.xlarge': 0.1664, 't3.2xlarge': 0.3328,
            'm5.large': 0.096, 'm5.xlarge': 0.192, 'm5.2xlarge': 0.384,
            'm5.4xlarge': 0.768, 'm5.8xlarge': 1.536, 'm5.12xlarge': 2.304,
            'm5.16xlarge': 3.072, 'm5.24xlarge': 4.608,
            'c5.large': 0.085, 'c5.xlarge': 0.17, 'c5.2xlarge': 0.34,
            'c5.4xlarge': 0.68, 'c5.9xlarge': 1.53, 'c5.12xlarge': 2.04,
            'c5.18xlarge': 3.06, 'c5.24xlarge': 4.08,
            'r5.large': 0.126, 'r5.xlarge': 0.252, 'r5.2xlarge': 0.504,
            'r5.4xlarge': 1.008, 'r5.8xlarge': 2.016, 'r5.12xlarge': 3.024,
            'r5.16xlarge': 4.032, 'r5.24xlarge': 6.048,
        }

        if instance_type not in cost_map:
            family = instance_type.split('.')[0]
            family_defaults = {'t3': 0.04, 'm5': 0.20, 'c5': 0.17, 'r5': 0.25}
            return family_defaults.get(family, 0.10)

        return cost_map[instance_type]
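    # The rates above are rough us-east-1 Linux on-demand figures, not a pricing
    # source. Example of the downstream math: m5.2xlarge -> m5.xlarge saves
    # (0.384 - 0.192) * 730 ≈ $140/month, roughly $1,680/year, which is what
    # feeds the per-row "Monthly Savings" and the total annual figure.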

    def _get_smaller_instance_type(self, current_type: str) -> Optional[str]:
        """Suggest a smaller instance type."""
        # Size progression within families
        sizes = ['nano', 'micro', 'small', 'medium', 'large', 'xlarge', '2xlarge',
                 '3xlarge', '4xlarge', '8xlarge', '9xlarge', '12xlarge', '16xlarge',
                 '18xlarge', '24xlarge', '32xlarge']

        parts = current_type.split('.')
        if len(parts) != 2:
            return None

        family, size = parts

        if size not in sizes:
            return None

        current_idx = sizes.index(size)
        if current_idx <= 0:
            return None  # Already at smallest

        # Go down one size
        new_size = sizes[current_idx - 1]
        return f"{family}.{new_size}"

    def analyze_ec2_instances(self):
        """Analyze EC2 instances for rightsizing."""
        print(f"\n[1/2] Analyzing EC2 instances (last {self.days} days)...")

        for region in self.regions:
            try:
                ec2 = self.session.client('ec2', region_name=region)
                cloudwatch = self.session.client('cloudwatch', region_name=region)

                instances = ec2.describe_instances(
                    Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]
                )

                for reservation in instances['Reservations']:
                    for instance in reservation['Instances']:
                        instance_id = instance['InstanceId']
                        instance_type = instance['InstanceType']

                        # Skip smallest instances (already optimized)
                        if any(size in instance_type for size in ['nano', 'micro', 'small']):
                            continue

                        name_tag = next((tag['Value'] for tag in instance.get('Tags', [])
                                         if tag['Key'] == 'Name'), 'N/A')

                        # Get CloudWatch metrics (CloudWatch timestamps are UTC)
                        end_time = datetime.now(timezone.utc)
                        start_time = end_time - timedelta(days=self.days)

                        try:
                            # CPU Utilization
                            cpu_metrics = cloudwatch.get_metric_statistics(
                                Namespace='AWS/EC2',
                                MetricName='CPUUtilization',
                                Dimensions=[{'Name': 'InstanceId', 'Value': instance_id}],
                                StartTime=start_time,
                                EndTime=end_time,
                                Period=3600,
                                Statistics=['Average', 'Maximum']
                            )
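                            # Period=3600 returns one datapoint per hour, so 14
                            # days is ~336 points and 30 days ~720, comfortably
                            # under the GetMetricStatistics per-call limit.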

                            if not cpu_metrics['Datapoints']:
                                continue

                            avg_cpu = sum([p['Average'] for p in cpu_metrics['Datapoints']]) / len(cpu_metrics['Datapoints'])
                            max_cpu = max([p['Maximum'] for p in cpu_metrics['Datapoints']])

                            # Check if underutilized
                            if avg_cpu < self.cpu_thresholds['low'] and max_cpu < 60:
                                smaller_type = self._get_smaller_instance_type(instance_type)

                                if smaller_type:
                                    current_cost = self._estimate_hourly_cost(instance_type)
                                    new_cost = self._estimate_hourly_cost(smaller_type)
                                    monthly_savings = (current_cost - new_cost) * 730
                                    annual_savings = monthly_savings * 12

                                    # Guard against gaps in the static cost map
                                    # producing zero or negative "savings"
                                    if monthly_savings <= 0:
                                        continue

                                    self.total_savings += annual_savings

                                    # Determine severity
                                    if avg_cpu < self.cpu_thresholds['underutilized']:
                                        severity = "High"
                                    else:
                                        severity = "Medium"

                                    self.findings['ec2'].append({
                                        'Region': region,
                                        'Instance ID': instance_id,
                                        'Name': name_tag,
                                        'Current Type': instance_type,
                                        'Recommended Type': smaller_type,
                                        'Avg CPU (%)': f"{avg_cpu:.1f}",
                                        'Max CPU (%)': f"{max_cpu:.1f}",
                                        'Monthly Savings': f"${monthly_savings:.2f}",
                                        'Severity': severity
                                    })

                        except Exception:
                            pass  # Skip instances without metrics

            except Exception as e:
                print(f"  Error scanning {region}: {str(e)}")

        print(f"  Found {len(self.findings['ec2'])} rightsizing opportunities")

    def analyze_rds_instances(self):
        """Analyze RDS instances for rightsizing."""
        print(f"\n[2/2] Analyzing RDS instances (last {self.days} days)...")

        for region in self.regions:
            try:
                rds = self.session.client('rds', region_name=region)
                cloudwatch = self.session.client('cloudwatch', region_name=region)

                instances = rds.describe_db_instances()

                for instance in instances['DBInstances']:
                    instance_id = instance['DBInstanceIdentifier']
                    instance_class = instance['DBInstanceClass']
                    engine = instance['Engine']

                    # Skip smallest instances
                    if any(size in instance_class for size in ['micro', 'small']):
                        continue

                    # Get CloudWatch metrics (CloudWatch timestamps are UTC)
                    end_time = datetime.now(timezone.utc)
                    start_time = end_time - timedelta(days=self.days)

                    try:
                        # CPU Utilization
                        cpu_metrics = cloudwatch.get_metric_statistics(
                            Namespace='AWS/RDS',
                            MetricName='CPUUtilization',
                            Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': instance_id}],
                            StartTime=start_time,
                            EndTime=end_time,
                            Period=3600,
                            Statistics=['Average', 'Maximum']
                        )

                        # Database Connections
                        conn_metrics = cloudwatch.get_metric_statistics(
                            Namespace='AWS/RDS',
                            MetricName='DatabaseConnections',
                            Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': instance_id}],
                            StartTime=start_time,
                            EndTime=end_time,
                            Period=3600,
                            Statistics=['Average', 'Maximum']
                        )

                        if not cpu_metrics['Datapoints']:
                            continue

                        avg_cpu = sum([p['Average'] for p in cpu_metrics['Datapoints']]) / len(cpu_metrics['Datapoints'])
                        max_cpu = max([p['Maximum'] for p in cpu_metrics['Datapoints']])

                        avg_conns = 0
                        max_conns = 0
                        if conn_metrics['Datapoints']:
                            avg_conns = sum([p['Average'] for p in conn_metrics['Datapoints']]) / len(conn_metrics['Datapoints'])
                            max_conns = max([p['Maximum'] for p in conn_metrics['Datapoints']])
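                        # Connection counts are reported for context only; the
                        # recommendation below is still gated on CPU alone.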

                        # Check if underutilized
                        if avg_cpu < self.cpu_thresholds['low'] and max_cpu < 60:
                            # RDS classes carry a "db." prefix (e.g. db.m5.large);
                            # strip it for the size lookup, then add it back.
                            base_class = instance_class.replace('db.', '', 1)
                            smaller_base = self._get_smaller_instance_type(base_class)
                            smaller_class = f"db.{smaller_base}" if smaller_base else None

                            if smaller_class:
                                # RDS pricing is roughly 2x EC2 for the same size
                                current_cost = self._estimate_hourly_cost(base_class) * 2
                                new_cost = self._estimate_hourly_cost(smaller_base) * 2

                                monthly_savings = (current_cost - new_cost) * 730
                                annual_savings = monthly_savings * 12
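                                # Assumption: the 2x multiplier is a crude
                                # single-AZ heuristic; actual RDS pricing varies
                                # by engine and licensing, and Multi-AZ roughly
                                # doubles the instance cost again.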

                                # Skip entries where the static cost map yields
                                # zero or negative savings
                                if monthly_savings <= 0:
                                    continue

                                self.total_savings += annual_savings

                                # Determine severity
                                if avg_cpu < self.cpu_thresholds['underutilized']:
                                    severity = "High"
                                else:
                                    severity = "Medium"

                                self.findings['rds'].append({
                                    'Region': region,
                                    'Instance ID': instance_id,
                                    'Engine': engine,
                                    'Current Class': instance_class,
                                    'Recommended Class': smaller_class,
                                    'Avg CPU (%)': f"{avg_cpu:.1f}",
                                    'Max CPU (%)': f"{max_cpu:.1f}",
                                    'Avg Connections': f"{avg_conns:.0f}",
                                    'Monthly Savings': f"${monthly_savings:.2f}",
                                    'Severity': severity
                                })

                    except Exception:
                        pass  # Skip instances without metrics

            except Exception as e:
                print(f"  Error scanning {region}: {str(e)}")

        print(f"  Found {len(self.findings['rds'])} rightsizing opportunities")

    def print_report(self):
        """Print rightsizing report."""
        print("\n" + "=" * 110)
        print("RIGHTSIZING RECOMMENDATIONS")
        print("=" * 110)

        if self.findings['ec2']:
            print("\nEC2 RIGHTSIZING OPPORTUNITIES")
            print("-" * 110)
            sorted_ec2 = sorted(self.findings['ec2'],
                                key=lambda x: float(x['Monthly Savings'].replace('$', '')),
                                reverse=True)
            print(tabulate(sorted_ec2, headers='keys', tablefmt='grid'))

        if self.findings['rds']:
            print("\nRDS RIGHTSIZING OPPORTUNITIES")
            print("-" * 110)
            sorted_rds = sorted(self.findings['rds'],
                                key=lambda x: float(x['Monthly Savings'].replace('$', '')),
                                reverse=True)
            print(tabulate(sorted_rds, headers='keys', tablefmt='grid'))

        print("\n" + "=" * 110)
        print(f"TOTAL ANNUAL SAVINGS: ${self.total_savings:.2f}")
        print("=" * 110)

        print("\n\nRIGHTSIZING BEST PRACTICES:")
        print("\n1. Before Rightsizing:")
        print("   - Review metrics over a longer period (30+ days recommended)")
        print("   - Check for seasonal patterns or cyclical workloads")
        print("   - Verify that the current size isn't required for burst capacity")
        print("   - Review application performance requirements")

        print("\n2. Rightsizing Process:")
        print("   - Test in a non-production environment first")
        print("   - Schedule during a maintenance window")
        print("   - EC2: Stop instance → Change type → Start")
        print("   - RDS: Modify instance (causes brief downtime)")
        print("   - Monitor performance after the change")
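        # Illustrative sketch only (not executed here; the identifiers below are
        # placeholders, not variables defined in this script):
        #
        #   ec2 = session.client('ec2', region_name=region)
        #   ec2.stop_instances(InstanceIds=[instance_id])
        #   ec2.get_waiter('instance_stopped').wait(InstanceIds=[instance_id])
        #   ec2.modify_instance_attribute(InstanceId=instance_id,
        #                                 InstanceType={'Value': recommended_type})
        #   ec2.start_instances(InstanceIds=[instance_id])
        #
        #   rds = session.client('rds', region_name=region)
        #   rds.modify_db_instance(DBInstanceIdentifier=db_id,
        #                          DBInstanceClass=recommended_class,
        #                          ApplyImmediately=False)  # next maintenance window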

        print("\n3. Important Considerations:")
        print("   - Some instance families can't be changed (requires new instance)")
        print("   - EBS-optimized settings may change with instance type")
        print("   - Network performance varies by instance size")
        print("   - Consider vertical scaling limits vs horizontal scaling")

        print("\n4. Alternative Approaches:")
        print("   - Consider serverless options (Lambda, Fargate, Aurora Serverless)")
        print("   - Use Auto Scaling to match capacity to demand")
        print("   - Implement horizontal scaling instead of larger instances")
        print("   - Evaluate containerization for better resource utilization")

    def run(self):
        """Run rightsizing analysis."""
        print("Analyzing AWS resources for rightsizing opportunities...")
        print(f"Metrics period: {self.days} days")
        print(f"Scanning {len(self.regions)} region(s)...\n")

        self.analyze_ec2_instances()
        self.analyze_rds_instances()

        self.print_report()


def main():
    parser = argparse.ArgumentParser(
        description='Analyze AWS resources for rightsizing opportunities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze all regions (14 days of metrics)
  python3 rightsizing_analyzer.py

  # Analyze with 30 days of metrics for better accuracy
  python3 rightsizing_analyzer.py --days 30

  # Analyze specific region
  python3 rightsizing_analyzer.py --region us-east-1

  # Use named profile
  python3 rightsizing_analyzer.py --profile production
"""
    )

    parser.add_argument('--region', help='AWS region (default: all regions)')
    parser.add_argument('--profile', help='AWS profile name (default: default profile)')
    parser.add_argument('--days', type=int, default=14,
                        help='Days of metrics to analyze (default: 14)')

    args = parser.parse_args()

    try:
        analyzer = RightsizingAnalyzer(
            profile=args.profile,
            region=args.region,
            days=args.days
        )
        analyzer.run()
    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()