Initial commit

2025-11-29 17:51:09 +08:00
commit 9d4643f587
14 changed files with 4713 additions and 0 deletions
--- a/scripts/spot_recommendations.py
+++ b/scripts/spot_recommendations.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Analyze EC2 workloads and recommend Spot instance opportunities.
+
+This script identifies:
+- Fault-tolerant workloads suitable for Spot instances
+- Potential savings from Spot vs On-Demand
+- Instances in Auto Scaling Groups (good Spot candidates)
+- Non-critical workloads based on tags
+
+Usage:
+    python3 spot_recommendations.py [--region REGION] [--profile PROFILE]
+
+Requirements:
+    pip install boto3 tabulate
+"""
+
+import argparse
+import boto3
+from datetime import datetime, timedelta
+from typing import List, Dict, Any
+from tabulate import tabulate
+import sys
+
+
+class SpotRecommendationAnalyzer:
+    def __init__(self, profile: str = None, region: str = None):
+        self.session = boto3.Session(profile_name=profile) if profile else boto3.Session()
+        self.regions = [region] if region else self._get_all_regions()
+        self.recommendations = []
+        self.total_savings = 0.0
+
+        # Average Spot savings (typically 60-90% discount)
+        self.spot_discount = 0.70  # Conservative 70% discount
+
+        # Tags that indicate Spot suitability
+        self.spot_friendly_tags = {
+            'Environment': ['dev', 'development', 'test', 'testing', 'staging', 'qa'],
+            'Workload': ['batch', 'processing', 'worker', 'ci', 'build'],
+            'CriticalLevel': ['low', 'non-critical', 'noncritical']
+        }
+
+    def _get_all_regions(self) -> List[str]:
+        """Get all enabled AWS regions."""
+        ec2 = self.session.client('ec2', region_name='us-east-1')
+        regions = ec2.describe_regions(AllRegions=False)
+        return [region['RegionName'] for region in regions['Regions']]
+
+    def _estimate_hourly_cost(self, instance_type: str) -> float:
+        """Rough estimate of hourly cost."""
+        cost_map = {
+            't3.micro': 0.0104, 't3.small': 0.0208, 't3.medium': 0.0416,
+            't3.large': 0.0832, 't3.xlarge': 0.1664, 't3.2xlarge': 0.3328,
+            'm5.large': 0.096, 'm5.xlarge': 0.192, 'm5.2xlarge': 0.384,
+            'm5.4xlarge': 0.768, 'm5.8xlarge': 1.536,
+            'c5.large': 0.085, 'c5.xlarge': 0.17, 'c5.2xlarge': 0.34,
+            'c5.4xlarge': 0.68, 'c5.9xlarge': 1.53,
+            'r5.large': 0.126, 'r5.xlarge': 0.252, 'r5.2xlarge': 0.504,
+            'r5.4xlarge': 1.008, 'r5.8xlarge': 2.016,
+        }
+
+        # Default fallback
+        if instance_type not in cost_map:
+            family = instance_type.split('.')[0]
+            family_defaults = {'t3': 0.04, 'm5': 0.10, 'c5': 0.09, 'r5': 0.13}
+            return family_defaults.get(family, 0.10)
+
+        return cost_map[instance_type]
+
+    def _calculate_suitability_score(self, instance: Dict, asg_member: bool) -> tuple:
+        """Calculate Spot suitability score (0-100) and reasons."""
+        score = 0
+        reasons = []
+
+        # Check if in Auto Scaling Group (high suitability)
+        if asg_member:
+            score += 40
+            reasons.append("Part of Auto Scaling Group")
+
+        # Check tags for environment/workload type
+        tags = {tag['Key']: tag['Value'].lower() for tag in instance.get('Tags', [])}
+
+        for key, spot_values in self.spot_friendly_tags.items():
+            if key in tags and tags[key] in spot_values:
+                score += 20
+                reasons.append(f"{key}={tags[key]}")
+
+        # Check instance age (older instances might be more stable)
+        launch_time = instance['LaunchTime']
+        days_running = (datetime.now(launch_time.tzinfo) - launch_time).days
+        if days_running > 30:
+            score += 10
+            reasons.append(f"Running {days_running} days (stable)")
+
+        # Check instance size (smaller instances have better Spot availability)
+        instance_type = instance['InstanceType']
+        if any(size in instance_type for size in ['micro', 'small', 'medium', 'large']):
+            score += 15
+            reasons.append("Standard size (good Spot availability)")
+
+        # Default baseline
+        if not reasons:
+            score = 30
+            reasons.append("General compute workload")
+
+        return min(score, 100), reasons
+
+    def analyze_instances(self):
+        """Analyze EC2 instances for Spot opportunities."""
+        print(f"\nAnalyzing EC2 instances across {len(self.regions)} region(s)...")
+
+        for region in self.regions:
+            try:
+                ec2 = self.session.client('ec2', region_name=region)
+                autoscaling = self.session.client('autoscaling', region_name=region)
+
+                # Get all Auto Scaling Groups
+                asg_instances = set()
+                try:
+                    asgs = autoscaling.describe_auto_scaling_groups()
+                    for asg in asgs['AutoScalingGroups']:
+                        for instance in asg['Instances']:
+                            asg_instances.add(instance['InstanceId'])
+                except Exception:
+                    pass
+
+                # Get all running On-Demand instances
+                instances = ec2.describe_instances(
+                    Filters=[
+                        {'Name': 'instance-state-name', 'Values': ['running']},
+                        {'Name': 'instance-lifecycle', 'Values': ['on-demand', 'scheduled']}
+                    ]
+                )
+
+                for reservation in instances['Reservations']:
+                    for instance in reservation['Instances']:
+                        instance_id = instance['InstanceId']
+                        instance_type = instance['InstanceType']
+                        asg_member = instance_id in asg_instances
+
+                        # Calculate suitability
+                        score, reasons = self._calculate_suitability_score(instance, asg_member)
+
+                        # Calculate savings
+                        hourly_cost = self._estimate_hourly_cost(instance_type)
+                        monthly_savings = hourly_cost * 730 * self.spot_discount
+                        annual_savings = monthly_savings * 12
+
+                        self.total_savings += annual_savings
+
+                        # Get instance name
+                        name_tag = next((tag['Value'] for tag in instance.get('Tags', [])
+                                       if tag['Key'] == 'Name'), 'N/A')
+
+                        # Determine recommendation
+                        if score >= 70:
+                            recommendation = "Highly Recommended"
+                        elif score >= 50:
+                            recommendation = "Recommended"
+                        elif score >= 30:
+                            recommendation = "Consider (with caution)"
+                        else:
+                            recommendation = "Not Recommended"
+
+                        self.recommendations.append({
+                            'Region': region,
+                            'Instance ID': instance_id,
+                            'Name': name_tag,
+                            'Type': instance_type,
+                            'In ASG': 'Yes' if asg_member else 'No',
+                            'Suitability Score': f"{score}/100",
+                            'Monthly Savings': f"${monthly_savings:.2f}",
+                            'Recommendation': recommendation,
+                            'Reasons': ', '.join(reasons[:2])  # Show top 2 reasons
+                        })
+
+            except Exception as e:
+                print(f"  Error scanning {region}: {str(e)}")
+
+        print(f"  Analyzed {len(self.recommendations)} instances")
+
+    def print_report(self):
+        """Print Spot recommendations report."""
+        print("\n" + "="*120)
+        print("SPOT INSTANCE RECOMMENDATIONS")
+        print("="*120)
+
+        # Sort by suitability score (descending)
+        sorted_recs = sorted(self.recommendations,
+                           key=lambda x: int(x['Suitability Score'].split('/')[0]),
+                           reverse=True)
+
+        if sorted_recs:
+            print(tabulate(sorted_recs, headers='keys', tablefmt='grid'))
+
+        print("\n" + "="*120)
+        print(f"TOTAL ANNUAL SAVINGS POTENTIAL: ${self.total_savings:.2f}")
+        print(f"(Assumes {int(self.spot_discount*100)}% average Spot discount)")
+        print("="*120)
+
+        print("\n\nSPOT INSTANCE BEST PRACTICES:")
+        print("\n1. Use Spot Instances for:")
+        print("   - Stateless applications")
+        print("   - Batch processing jobs")
+        print("   - CI/CD and build servers")
+        print("   - Data analysis and processing")
+        print("   - Dev/test/staging environments")
+        print("   - Auto Scaling Groups with mixed instance types")
+
+        print("\n2. Do NOT use Spot Instances for:")
+        print("   - Databases without replicas")
+        print("   - Stateful applications without checkpointing")
+        print("   - Real-time, latency-sensitive services")
+        print("   - Applications that can't handle interruptions")
+
+        print("\n3. Spot Best Practices:")
+        print("   - Use Spot Fleet or Auto Scaling Groups with Spot")
+        print("   - Diversify across multiple instance types")
+        print("   - Implement graceful shutdown handlers (2-minute warning)")
+        print("   - Use Spot Instance interruption notices")
+        print("   - Consider Spot + On-Demand mix (e.g., 70/30)")
+        print("   - Set appropriate max price (typically On-Demand price)")
+
+        print("\n4. Implementation Steps:")
+        print("   - Test Spot behavior in non-production first")
+        print("   - Implement interruption handling in your application")
+        print("   - Use EC2 Fleet or Auto Scaling with mixed instances policy")
+        print("   - Monitor Spot interruption rates")
+        print("   - Set up CloudWatch alarms for Spot terminations")
+
+        print("\n5. Tools to Use:")
+        print("   - EC2 Spot Instance Advisor (check interruption rates)")
+        print("   - Auto Scaling Groups with mixed instances policy")
+        print("   - Spot Fleet for diverse instance type selection")
+        print("   - AWS Spot Instances best practices guide")
+
+    def run(self):
+        """Run Spot analysis."""
+        print("="*80)
+        print("AWS SPOT INSTANCE OPPORTUNITY ANALYZER")
+        print("="*80)
+
+        self.analyze_instances()
+        self.print_report()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Analyze EC2 workloads for Spot instance opportunities',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Analyze all regions with default profile
+  python3 spot_recommendations.py
+
+  # Analyze specific region
+  python3 spot_recommendations.py --region us-east-1
+
+  # Use named profile
+  python3 spot_recommendations.py --profile production
+        """
+    )
+
+    parser.add_argument('--region', help='AWS region (default: all regions)')
+    parser.add_argument('--profile', help='AWS profile name (default: default profile)')
+
+    args = parser.parse_args()
+
+    try:
+        analyzer = SpotRecommendationAnalyzer(
+            profile=args.profile,
+            region=args.region
+        )
+        analyzer.run()
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()