From d618be855623303db6ff666e4db3f4a766e2fd81 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 17:56:26 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 15 + README.md | 3 + commands/cost-analyze.md | 360 ++++++++++++++++++++++ commands/cost-optimize.md | 480 ++++++++++++++++++++++++++++++ plugin.lock.json | 61 ++++ skills/aws-cost-expert/SKILL.md | 416 ++++++++++++++++++++++++++ skills/cloud-pricing/SKILL.md | 325 ++++++++++++++++++++ skills/cost-optimization/SKILL.md | 337 +++++++++++++++++++++ 8 files changed, 1997 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 commands/cost-analyze.md create mode 100644 commands/cost-optimize.md create mode 100644 plugin.lock.json create mode 100644 skills/aws-cost-expert/SKILL.md create mode 100644 skills/cloud-pricing/SKILL.md create mode 100644 skills/cost-optimization/SKILL.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..f1c6f8c --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "specweave-cost-optimizer", + "description": "Cloud cost optimization and analysis for AWS, Azure, GCP, and serverless platforms. Provides cost analysis, optimization recommendations, pricing comparisons, budget alerts, and serverless cost modeling with 2024/2025 pricing.", + "version": "0.24.0", + "author": { + "name": "Anton Abyzov", + "email": "anton.abyzov@gmail.com" + }, + "skills": [ + "./skills" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cdb1f10 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# specweave-cost-optimizer + +Cloud cost optimization and analysis for AWS, Azure, GCP, and serverless platforms. Provides cost analysis, optimization recommendations, pricing comparisons, budget alerts, and serverless cost modeling with 2024/2025 pricing. 
diff --git a/commands/cost-analyze.md b/commands/cost-analyze.md new file mode 100644 index 0000000..e2814aa --- /dev/null +++ b/commands/cost-analyze.md @@ -0,0 +1,360 @@ +# /specweave-cost-optimizer:cost-analyze + +Analyze cloud infrastructure costs and identify optimization opportunities across AWS, Azure, and GCP. + +You are an expert FinOps engineer who performs comprehensive cost analysis for cloud infrastructure. + +## Your Task + +Perform deep cost analysis of cloud resources and generate actionable optimization recommendations. + +### 1. Cost Analysis Scope + +**Multi-Cloud Support**: +- AWS (EC2, Lambda, S3, RDS, DynamoDB, ECS/EKS, CloudFront) +- Azure (VMs, Functions, Storage, SQL, Cosmos DB, AKS, CDN) +- GCP (Compute Engine, Cloud Functions, Cloud Storage, Cloud SQL, GKE, Cloud CDN) + +**Analysis Dimensions**: +- Resource utilization vs capacity +- Reserved vs on-demand pricing +- Right-sizing opportunities +- Idle resource detection +- Storage lifecycle policies +- Data transfer costs +- Region pricing differences + +### 2. Data Collection Methods + +**AWS Cost Explorer**: +```bash +# Get cost and usage data +aws ce get-cost-and-usage \ + --time-period Start=2025-01-01,End=2025-02-01 \ + --granularity DAILY \ + --metrics BlendedCost \ + --group-by Type=SERVICE + +# Get right-sizing recommendations +aws ce get-rightsizing-recommendation \ + --service AmazonEC2 \ + --page-size 100 +``` + +**Azure Cost Management**: +```bash +# Get cost details +az consumption usage list \ + --start-date 2025-01-01 \ + --end-date 2025-01-31 + +# Get advisor recommendations +az advisor recommendation list \ + --category Cost +``` + +**GCP Billing API**: +```bash +# Export billing to BigQuery +# Then query: +SELECT + service.description as service, + SUM(cost) as total_cost +FROM `project.dataset.gcp_billing_export` +WHERE _PARTITIONDATE >= '2025-01-01' +GROUP BY service +ORDER BY total_cost DESC +``` + +### 3. 
Analysis Framework + +**Step 1: Resource Inventory** +- List all compute instances (EC2, VMs, Compute Engine) +- Identify database resources (RDS, SQL, Cloud SQL) +- Catalog storage (S3, Blob, Cloud Storage) +- Map serverless functions (Lambda, Functions, Cloud Functions) +- Document networking (Load Balancers, NAT Gateways, VPN) + +**Step 2: Utilization Analysis** +```typescript +interface ResourceUtilization { + resourceId: string; + resourceType: string; + cpu: { + average: number; + peak: number; + p95: number; + }; + memory: { + average: number; + peak: number; + p95: number; + }; + recommendation: 'downsize' | 'rightsize' | 'optimal' | 'upsize'; +} + +// Example thresholds +const THRESHOLDS = { + cpu: { + idle: 5, // < 5% CPU = idle + underused: 20, // < 20% CPU = oversized (downsize candidate) + optimal: 70, // 20-70% = optimal + overused: 85, // > 85% = needs upsize + }, + memory: { + idle: 10, + underused: 30, + optimal: 75, + overused: 90, + }, +}; +``` + +**Step 3: Cost Breakdown** +```typescript +interface CostBreakdown { + total: number; + byService: Record<string, number>; + byEnvironment: Record<string, number>; + byTeam: Record<string, number>; + trends: { + mom: number; // month-over-month % + yoy: number; // year-over-year % + }; +} +``` + +### 4. 
Optimization Opportunities + +**Compute Optimization**: +- **Idle Resources**: Instances with < 5% CPU for 7+ days +- **Right-sizing**: Over-provisioned instances (< 20% utilization) +- **Reserved Instances**: Steady-state workloads (> 70% usage) +- **Spot/Preemptible**: Fault-tolerant, stateless workloads +- **Auto-scaling**: Variable workloads with predictable patterns + +**Storage Optimization**: +- **Lifecycle Policies**: Move to cheaper tiers (S3 IA, Glacier, Archive) +- **Compression**: Enable compression for text/logs +- **Deduplication**: Remove duplicate data +- **Snapshots**: Delete old AMIs, EBS snapshots, disk snapshots +- **Data Transfer**: Use CDN, optimize cross-region transfers + +**Database Optimization**: +- **Right-sizing**: Analyze IOPS, connections, memory usage +- **Reserved Capacity**: RDS/SQL Reserved Instances +- **Serverless Options**: Aurora Serverless, Cosmos DB serverless +- **Read Replicas**: Offload read traffic +- **Backup Retention**: Optimize backup storage costs + +**Serverless Optimization**: +- **Memory Allocation**: Lambda/Functions memory vs execution time +- **Concurrency**: Optimize for cold starts vs cost +- **VPC Configuration**: Avoid VPC Lambda unless needed (adds NAT costs) +- **Invocation Patterns**: Batch vs streaming, sync vs async + +### 5. 
Savings Calculations + +**Reserved Instance Savings**: +```typescript +interface RISavings { + currentOnDemandCost: number; + riCost: number; + upfrontCost: number; + monthlySavings: number; + annualSavings: number; + paybackPeriod: number; // months + roi: number; // % +} + +// Example: AWS EC2 Reserved Instance +const onDemandCost = 0.096 * 730; // t3.large on-demand/month +const ri1Year = 0.062 * 730; // t3.large 1-year RI +const savings = onDemandCost - ri1Year; // $24.82/month = $297.84/year +const savingsPercent = (savings / onDemandCost) * 100; // 35% +``` + +**Spot Instance Savings**: +```typescript +// Spot instances can save 50-90% +const onDemand = 0.096; // t3.large +const spot = 0.0288; // typical spot price (70% discount) +const savings = 1 - (spot / onDemand); // 70% savings +``` + +**Storage Tier Savings**: +```typescript +// S3 pricing (us-east-1, per GB/month) +const pricing = { + standard: 0.023, + ia: 0.0125, // Infrequent Access (46% cheaper) + glacier: 0.004, // Glacier (83% cheaper) + deepArchive: 0.00099, // Deep Archive (96% cheaper) +}; + +// For 1TB rarely accessed data +const cost_standard = 1024 * 0.023; // $23.55/month +const cost_ia = 1024 * 0.0125; // $12.80/month +const savings = cost_standard - cost_ia; // $10.75/month = $129/year +``` + +### 6. Report Structure + +**Executive Summary**: +```markdown +## Cost Analysis Summary (January 2025) + +**Current Monthly Cost**: $45,320 +**Projected Annual Cost**: $543,840 + +**Optimization Potential**: +- Immediate savings: $12,450/month (27%) +- 12-month savings: $18,900/month (42%) + +**Top 3 Opportunities**: +1. Right-size EC2 instances: $6,200/month +2. Purchase RDS Reserved Instances: $4,800/month +3. Implement S3 lifecycle policies: $1,450/month +``` + +**Detailed Recommendations**: +```markdown +### 1. 
Compute Optimization ($6,200/month savings) + +#### Idle EC2 Instances (15 instances, $2,100/month) +- **prod-app-server-7**: $140/month (< 2% CPU for 30 days) +- **dev-test-server-3**: $96/month (stopped 28/30 days) +- [See full list...] + +**Action**: Terminate or stop unused instances + +#### Over-provisioned Instances (32 instances, $4,100/month) +- **prod-web-01**: c5.2xlarge → c5.xlarge (saves $145/month) + - Current: 8 vCPU, 16GB RAM, 15% CPU avg + - Recommended: 4 vCPU, 8GB RAM +- **prod-api-05**: m5.4xlarge → m5.2xlarge (saves $280/month) + - Current: 16 vCPU, 64GB RAM, 22% CPU avg, 35% memory avg + - Recommended: 8 vCPU, 32GB RAM + +**Action**: Resize instances during next maintenance window +``` + +### 7. Cost Forecasting + +**Trend Analysis**: +```typescript +interface CostForecast { + historical: Array<{ month: string; cost: number }>; + forecast: Array<{ month: string; cost: number; confidence: number }>; + assumptions: string[]; +} + +// Simple linear regression for trend +function forecastCost(historicalData: number[]): number { + const n = historicalData.length; + const sumX = (n * (n + 1)) / 2; + const sumY = historicalData.reduce((a, b) => a + b, 0); + const sumXY = historicalData.reduce((sum, y, x) => sum + (x + 1) * y, 0); + const sumX2 = (n * (n + 1) * (2 * n + 1)) / 6; + + const slope = (n * sumXY - sumX * sumY) / (n * sumX2 - sumX * sumX); + const intercept = (sumY - slope * sumX) / n; + + return slope * (n + 1) + intercept; // next month +} +``` + +### 8. 
Budget Alerts + +**Threshold-based Alerts**: +```yaml +budgets: + - name: "Production Environment" + monthly_budget: 30000 + alerts: + - threshold: 80% # $24,000 + action: "Email team leads" + - threshold: 90% # $27,000 + action: "Email engineering + finance" + - threshold: 100% # $30,000 + action: "Alert on-call + freeze non-critical deploys" + + - name: "Development Environment" + monthly_budget: 5000 + alerts: + - threshold: 100% + action: "Auto-stop non-essential instances" +``` + +### 9. Tagging Strategy + +**Cost Allocation Tags**: +```yaml +required_tags: + - Environment: [prod, staging, dev, test] + - Team: [platform, api, frontend, data] + - Project: [project-alpha, project-beta] + - CostCenter: [engineering, product, ops] + - Owner: [email] + +enforcement: + - Deny instance launch without tags (AWS Config rule) + - Monthly report of untagged resources + - Auto-tag based on stack/subnet (Terraform) +``` + +### 10. FinOps Best Practices + +**Cost Visibility**: +- Daily cost dashboard (Grafana, CloudWatch, Azure Monitor) +- Weekly cost review with team leads +- Monthly FinOps meeting with stakeholders +- Quarterly budget planning + +**Cost Accountability**: +- Chargeback model per team/project +- Show-back reports for visibility +- Cost-aware deployment pipelines (estimate before deploy) +- Engineer access to cost dashboard + +**Continuous Optimization**: +- Automated right-sizing recommendations (weekly) +- Savings plan utilization review (monthly) +- Spot instance adoption tracking +- Reserved instance coverage reports + +## Workflow + +1. **Collect Data**: Pull cost/usage data from cloud providers (last 30-90 days) +2. **Analyze Utilization**: Calculate CPU, memory, disk, network metrics +3. **Identify Waste**: Find idle, over-provisioned, orphaned resources +4. **Calculate Savings**: Quantify potential savings per recommendation +5. **Prioritize**: Rank by savings potential and implementation effort +6. 
**Generate Report**: Create executive summary + detailed action plan +7. **Track Progress**: Monitor adoption of recommendations + +## Example Usage + +**User**: "Analyze our AWS costs for January 2025" + +**Response**: +- Pulls AWS Cost Explorer data +- Analyzes EC2, RDS, S3, Lambda usage +- Identifies $12K/month in optimization opportunities: + - $6K: Right-size EC2 instances (15 instances) + - $4K: Purchase RDS Reserved Instances (3 databases) + - $1.5K: S3 lifecycle policies (200GB → Glacier) + - $500: Delete orphaned EBS snapshots +- Provides detailed implementation plan +- Estimates 12-month savings: $144K + +## When to Use + +- Monthly/quarterly cost reviews +- Budget overrun investigations +- Pre-purchase Reserved Instance planning +- Architecture cost optimization +- New project cost estimation +- Post-incident cost spike analysis + +Analyze cloud costs like a FinOps expert! diff --git a/commands/cost-optimize.md b/commands/cost-optimize.md new file mode 100644 index 0000000..5dd540b --- /dev/null +++ b/commands/cost-optimize.md @@ -0,0 +1,480 @@ +# /specweave-cost-optimizer:cost-optimize + +Implement cost optimization recommendations with automated resource modifications and savings plan purchases. + +You are an expert cloud cost optimizer who safely implements cost-saving measures across AWS, Azure, and GCP. + +## Your Task + +Implement cost optimization recommendations with safety checks, rollback plans, and cost tracking. + +### 1. 
Optimization Categories + +**Immediate Actions (No Downtime)**: +- Terminate idle resources +- Delete orphaned resources (unattached EBS, old snapshots) +- Implement storage lifecycle policies +- Enable compression/deduplication +- Clean up unused security groups, load balancers + +**Scheduled Actions (Maintenance Window)**: +- Right-size instances (resize down/up) +- Migrate to reserved instances +- Convert EBS types (gp2 → gp3) +- Database version upgrades + +**Long-term Actions (Architecture Changes)**: +- Migrate to serverless +- Implement auto-scaling +- Multi-region optimization +- Spot/preemptible adoption + +### 2. Safety Framework + +**Pre-optimization Checks**: +```typescript +interface SafetyCheck { + resourceId: string; + checks: { + hasBackup: boolean; + hasMonitoring: boolean; + hasRollbackPlan: boolean; + impactAssessment: 'none' | 'low' | 'medium' | 'high'; + stakeholderApproval: boolean; + }; + canProceed: boolean; + blockers: string[]; +} + +// Example safety check +async function canOptimize(resource: Resource): Promise { + const checks = { + hasBackup: await hasRecentBackup(resource), + hasMonitoring: await hasActiveAlarms(resource), + hasRollbackPlan: true, // Manual rollback documented + impactAssessment: assessImpact(resource), + stakeholderApproval: resource.tags.ApprovedForOptimization === 'true', + }; + + const blockers = []; + if (!checks.hasBackup) blockers.push('Missing backup'); + if (!checks.hasMonitoring) blockers.push('No monitoring alarms'); + if (checks.impactAssessment === 'high' && !checks.stakeholderApproval) { + blockers.push('Requires stakeholder approval'); + } + + return { + resourceId: resource.id, + checks, + canProceed: blockers.length === 0, + blockers, + }; +} +``` + +**Rollback Plans**: +```typescript +interface RollbackPlan { + optimizationId: string; + originalState: any; + rollbackSteps: Array<{ + action: string; + command: string; + estimatedTime: number; + }>; + rollbackWindow: number; // hours + contactInfo: 
string[]; +} + +// Example: EC2 instance resize rollback +const rollback: RollbackPlan = { + optimizationId: 'opt-001', + originalState: { + instanceType: 'c5.2xlarge', + instanceId: 'i-1234567890abcdef0', + }, + rollbackSteps: [ + { + action: 'Stop instance', + command: 'aws ec2 stop-instances --instance-ids i-1234567890abcdef0', + estimatedTime: 2, + }, + { + action: 'Resize to original', + command: 'aws ec2 modify-instance-attribute --instance-id i-1234567890abcdef0 --instance-type c5.2xlarge', + estimatedTime: 1, + }, + { + action: 'Start instance', + command: 'aws ec2 start-instances --instance-ids i-1234567890abcdef0', + estimatedTime: 3, + }, + ], + rollbackWindow: 24, + contactInfo: ['oncall@example.com', 'platform-team@example.com'], +}; +``` + +### 3. Optimization Actions + +**Right-size EC2 Instance**: +```bash +#!/bin/bash +# Right-size EC2 instance with safety checks + +INSTANCE_ID="i-1234567890abcdef0" +NEW_TYPE="c5.xlarge" +OLD_TYPE=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query 'Reservations[0].Instances[0].InstanceType' --output text) + +# 1. Create AMI backup +echo "Creating backup AMI..." +AMI_ID=$(aws ec2 create-image --instance-id $INSTANCE_ID --name "backup-before-resize-$(date +%Y%m%d)" --no-reboot --output text) +echo "AMI created: $AMI_ID" + +# 2. Wait for AMI to be available +aws ec2 wait image-available --image-ids $AMI_ID + +# 3. Stop instance +echo "Stopping instance..." +aws ec2 stop-instances --instance-ids $INSTANCE_ID +aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID + +# 4. Modify instance type +echo "Resizing $OLD_TYPE -> $NEW_TYPE..." +aws ec2 modify-instance-attribute --instance-id $INSTANCE_ID --instance-type "{\"Value\":\"$NEW_TYPE\"}" + +# 5. Start instance +echo "Starting instance..." +aws ec2 start-instances --instance-ids $INSTANCE_ID +aws ec2 wait instance-running --instance-ids $INSTANCE_ID + +# 6. 
Health check +sleep 30 +HEALTH=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID --query 'InstanceStatuses[0].InstanceStatus.Status' --output text) + +if [ "$HEALTH" = "ok" ]; then + echo "✅ Resize successful!" +else + echo "❌ Health check failed. Rolling back..." + # Rollback logic here +fi +``` + +**Purchase Reserved Instances**: +```typescript +interface RIPurchase { + instanceType: string; + count: number; + term: '1year' | '3year'; + paymentOption: 'all-upfront' | 'partial-upfront' | 'no-upfront'; + estimatedSavings: number; + breakEvenMonths: number; +} + +// Example RI purchase decision +const riRecommendation: RIPurchase = { + instanceType: 't3.large', + count: 10, // Running 10 steady-state instances + term: '1year', + paymentOption: 'partial-upfront', + estimatedSavings: 3500, // $3,500/year + breakEvenMonths: 4, +}; + +// Purchase command +aws ec2 purchase-reserved-instances-offering \ + --reserved-instances-offering-id \ + --instance-count 10 +``` + +**Implement S3 Lifecycle Policy**: +```typescript +const lifecyclePolicy = { + Rules: [ + { + Id: 'Move old logs to Glacier', + Status: 'Enabled', + Filter: { Prefix: 'logs/' }, + Transitions: [ + { + Days: 30, + StorageClass: 'STANDARD_IA', // Infrequent Access after 30 days + }, + { + Days: 90, + StorageClass: 'GLACIER', // Glacier after 90 days + }, + { + Days: 365, + StorageClass: 'DEEP_ARCHIVE', // Deep Archive after 1 year + }, + ], + Expiration: { + Days: 2555, // Delete after 7 years + }, + }, + { + Id: 'Delete incomplete multipart uploads', + Status: 'Enabled', + AbortIncompleteMultipartUpload: { + DaysAfterInitiation: 7, + }, + }, + ], +}; + +// Apply policy +aws s3api put-bucket-lifecycle-configuration \ + --bucket my-bucket \ + --lifecycle-configuration file://lifecycle-policy.json +``` + +**Delete Orphaned Resources**: +```bash +#!/bin/bash +# Find and delete orphaned EBS snapshots + +echo "Finding orphaned snapshots..." 
+ +# Get all snapshots owned by account +SNAPSHOTS=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots[*].[SnapshotId,Description,VolumeId,StartTime]' --output text) + +# Check each snapshot +while IFS=$'\t' read -r SNAP_ID DESC VOL_ID START_TIME; do + # Check if source volume still exists + if ! aws ec2 describe-volumes --volume-ids "$VOL_ID" &>/dev/null; then + AGE_DAYS=$(( ($(date +%s) - $(date -d "$START_TIME" +%s)) / 86400 )) + + if [ $AGE_DAYS -gt 90 ]; then + echo "Orphaned snapshot: $SNAP_ID (age: $AGE_DAYS days)" + echo " Description: $DESC" + echo " Volume: $VOL_ID (deleted)" + + # Dry run: deletion is commented out (uncomment the next line to execute) + # aws ec2 delete-snapshot --snapshot-id "$SNAP_ID" + fi + fi +done <<< "$SNAPSHOTS" +``` + +### 4. Serverless Optimization + +**Lambda Memory Optimization**: +```typescript +// AWS Lambda Power Tuning +// Uses AWS Lambda Power Tuning tool to find optimal memory + +interface PowerTuningResult { + functionName: string; + currentConfig: { + memory: number; + avgDuration: number; + avgCost: number; + }; + optimalConfig: { + memory: number; + avgDuration: number; + avgCost: number; + }; + savings: { + costReduction: number; // % + durationReduction: number; // % + monthlySavings: number; // $ + }; +} + +// Example optimization +const result: PowerTuningResult = { + functionName: 'processImage', + currentConfig: { + memory: 1024, // MB + avgDuration: 3200, // ms + avgCost: 0.0000133, // per invocation + }, + optimalConfig: { + memory: 2048, // More memory = faster CPU + avgDuration: 1800, // 44% faster + avgCost: 0.0000119, // 11% cheaper + }, + savings: { + costReduction: 10.5, + durationReduction: 43.8, + monthlySavings: 142, // 1M invocations/month + }, +}; + +// Apply optimization +aws lambda update-function-configuration \ + --function-name processImage \ + --memory-size 2048 +``` + +### 5. 
Cost Tracking & Validation + +**Pre/Post Optimization Comparison**: +```typescript +interface OptimizationResult { + optimizationId: string; + implementationDate: Date; + resource: string; + action: string; + preOptimization: { + cost: number; + metrics: Record; + }; + postOptimization: { + cost: number; + metrics: Record; + }; + actualSavings: number; + projectedSavings: number; + varianceExplanation: string; +} + +// Track for 30 days post-optimization +async function validateOptimization(optId: string): Promise { + const baseline = await getCostBaseline(optId, 'before'); + const current = await getCostBaseline(optId, 'after'); + + const actualSavings = baseline.cost - current.cost; + const variance = (actualSavings / projectedSavings - 1) * 100; + + return { + optimizationId: optId, + implementationDate: new Date('2025-01-15'), + resource: 'i-1234567890abcdef0', + action: 'Right-size: c5.2xlarge → c5.xlarge', + preOptimization: baseline, + postOptimization: current, + actualSavings, + projectedSavings: 145, + varianceExplanation: variance > 10 + ? 'Higher traffic than baseline period' + : 'Within expected range', + }; +} +``` + +### 6. 
Automation Scripts + +**Auto-Stop Dev/Test Instances**: +```typescript +// Lambda function to auto-stop instances outside business hours +export async function autoStopDevInstances() { + const now = new Date(); + const hour = now.getHours(); + const day = now.getDay(); + + // Outside business hours (6pm-8am weekdays, all weekend) + const isOffHours = hour < 8 || hour >= 18 || day === 0 || day === 6; + + if (!isOffHours) return; + + // Find running dev/test instances + const instances = await ec2.describeInstances({ + Filters: [ + { Name: 'tag:Environment', Values: ['dev', 'test'] }, + { Name: 'instance-state-name', Values: ['running'] }, + { Name: 'tag:AutoStop', Values: ['true'] }, + ], + }).promise(); + + const instanceIds = instances.Reservations + .flatMap(r => r.Instances || []) + .map(i => i.InstanceId!); + + if (instanceIds.length > 0) { + await ec2.stopInstances({ InstanceIds: instanceIds }).promise(); + console.log(`Stopped ${instanceIds.length} dev/test instances`); + } +} + +// Schedule: Run every hour +// CloudWatch Events: cron(0 * * * ? *) +``` + +### 7. 
Optimization Dashboard + +**Cost Savings Dashboard**: +```typescript +interface SavingsDashboard { + period: string; + totalSavings: number; + savingsByCategory: { + compute: number; + storage: number; + database: number; + network: number; + other: number; + }; + topOptimizations: Array<{ + description: string; + savings: number; + status: 'completed' | 'in-progress' | 'planned'; + }>; + roi: number; +} + +// Monthly dashboard +const dashboard: SavingsDashboard = { + period: 'January 2025', + totalSavings: 12450, + savingsByCategory: { + compute: 6200, + storage: 1800, + database: 3500, + network: 750, + other: 200, + }, + topOptimizations: [ + { + description: 'Right-sized 32 EC2 instances', + savings: 4100, + status: 'completed', + }, + { + description: 'Purchased 5 RDS Reserved Instances', + savings: 3500, + status: 'completed', + }, + { + description: 'Terminated 15 idle instances', + savings: 2100, + status: 'completed', + }, + ], + roi: 8.5, // Implementation time vs savings +}; +``` + +## Workflow + +1. **Review Recommendations**: Prioritize by savings + effort +2. **Safety Check**: Verify backups, monitoring, approvals +3. **Create Rollback Plan**: Document restore steps +4. **Implement Change**: Execute optimization (staged rollout) +5. **Monitor Impact**: Track metrics for 24-48 hours +6. **Validate Savings**: Compare actual vs projected costs +7. 
**Document Results**: Update cost tracking dashboard + +## Example Usage + +**User**: "Optimize our over-provisioned EC2 instances" + +**Response**: +- Reviews 32 over-provisioned instances +- Creates safety checklist (backups, monitoring, approvals) +- Generates resize plan with rollback procedures +- Provides automated scripts for off-hours execution +- Sets up post-optimization monitoring +- Projects $4,100/month savings + +## When to Use + +- Implementing cost analysis recommendations +- Emergency budget cuts +- Scheduled optimization sprints +- New architecture deployment +- Post-incident cost spike mitigation + +Optimize cloud costs safely with automated tooling! diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..1564cb5 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,61 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:anton-abyzov/specweave:plugins/specweave-cost-optimizer", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "d3b2557c93d4fc06ddc82a564ffa54ca7758c1d4", + "treeHash": "d099735a3005cce85159dfb7e2b00bd95afae02cec1a85b820912ace41a30077", + "generatedAt": "2025-11-28T10:13:55.179650Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "specweave-cost-optimizer", + "description": "Cloud cost optimization and analysis for AWS, Azure, GCP, and serverless platforms. 
Provides cost analysis, optimization recommendations, pricing comparisons, budget alerts, and serverless cost modeling with 2024/2025 pricing.", + "version": "0.24.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "55de9f30db126f7b549505d4b91a5d0a6961308a3fee09575cb6cbfbc185dddd" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "157475663b89b94590f11243c599b45ab1d83ffbb0ab8d62a097652323e2c106" + }, + { + "path": "commands/cost-optimize.md", + "sha256": "a8e1a3380ce449dd7912346195cec28d56682aad7b3db003aa8602943cb9eb97" + }, + { + "path": "commands/cost-analyze.md", + "sha256": "bdfe616bd1683074d9481a734468d82191bd45c4128cb7d4f5730cf55428ddfa" + }, + { + "path": "skills/cloud-pricing/SKILL.md", + "sha256": "58e8695cfa64270ecc2f5133114c9c99605f2af0c8d1aeb72ec0c19771ee81d8" + }, + { + "path": "skills/cost-optimization/SKILL.md", + "sha256": "61dfd6b69aa820fc34807790afd8f0608fd98aa218a242098ab0e7a5d59018db" + }, + { + "path": "skills/aws-cost-expert/SKILL.md", + "sha256": "e22ffb310eebb17fcba2dbdb81e229bd52e4a1bba673f12ecd7b3aca6fac2087" + } + ], + "dirSha256": "d099735a3005cce85159dfb7e2b00bd95afae02cec1a85b820912ace41a30077" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/aws-cost-expert/SKILL.md b/skills/aws-cost-expert/SKILL.md new file mode 100644 index 0000000..4024116 --- /dev/null +++ b/skills/aws-cost-expert/SKILL.md @@ -0,0 +1,416 @@ +--- +name: aws-cost-expert +description: Deep AWS cost optimization expertise covering EC2 Reserved Instances, Savings Plans, Spot Instances, Lambda cost optimization, S3 lifecycle policies, RDS Reserved Instances, Cost Explorer, AWS Budgets, Trusted Advisor, Compute Optimizer, Cost Anomaly Detection, and AWS-specific FinOps best practices. 
Activates for AWS costs, AWS pricing, EC2 costs, Lambda costs, S3 costs, RDS costs, AWS savings plans, AWS reserved instances, AWS spot instances, AWS cost explorer, AWS budgets, reduce AWS bill. +--- + +# AWS Cost Optimization Expert + +Deep expertise in AWS-specific cost optimization strategies and services. + +## AWS Cost Management Services + +### 1. Cost Explorer +```bash +# Get monthly costs by service +aws ce get-cost-and-usage \ + --time-period Start=2025-01-01,End=2025-02-01 \ + --granularity MONTHLY \ + --metrics BlendedCost \ + --group-by Type=SERVICE + +# Get EC2 costs by instance type +aws ce get-cost-and-usage \ + --time-period Start=2025-01-01,End=2025-02-01 \ + --granularity DAILY \ + --metrics UnblendedCost \ + --filter file://ec2-filter.json \ + --group-by Type=INSTANCE_TYPE +``` + +### 2. AWS Budgets +```yaml +Budget Configuration: + - Monthly budget: $30,000 + - Alert at 80% ($24,000) + - Alert at 90% ($27,000) + - Alert at 100% ($30,000) + - Alert at 110% ($33,000) - critical + +Actions: + - Stop non-production instances + - Deny new resource creation + - Email C-level executives +``` + +### 3. Compute Optimizer +```bash +# Get EC2 right-sizing recommendations +aws compute-optimizer get-ec2-instance-recommendations \ + --max-results 100 + +# Get Lambda function recommendations +aws compute-optimizer get-lambda-function-recommendations +``` + +### 4. 
Trusted Advisor +```bash +# Get cost optimization checks +aws support describe-trusted-advisor-checks \ + --language en \ + --query 'checks[?category==`cost_optimizing`]' + +# Check results +aws support describe-trusted-advisor-check-result \ + --check-id <check-id> +``` + +## EC2 Cost Optimization + +### Savings Plans vs Reserved Instances +```typescript +interface Comparison { + option: string; + flexibility: string; + discount: string; + commitment: string; + bestFor: string; +} + +const options: Comparison[] = [ + { + option: 'On-Demand', + flexibility: 'Maximum', + discount: '0%', + commitment: 'None', + bestFor: 'Unpredictable workloads', + }, + { + option: 'Spot Instances', + flexibility: 'Medium', + discount: '50-90%', + commitment: 'None', + bestFor: 'Fault-tolerant batch workloads', + }, + { + option: 'Compute Savings Plans', + flexibility: 'High (any instance, any region)', + discount: '30-70%', + commitment: '1 or 3 years', + bestFor: 'Flexible compute usage', + }, + { + option: 'EC2 Instance Savings Plans', + flexibility: 'Medium (same instance family, same region)', + discount: '35-72%', + commitment: '1 or 3 years', + bestFor: 'Consistent instance family usage', + }, + { + option: 'Reserved Instances', + flexibility: 'Low (specific instance type)', + discount: '40-75%', + commitment: '1 or 3 years', + bestFor: 'Predictable, steady-state workloads', + }, +]; +``` + +### Graviton Instances (ARM) +```yaml +Benefits: + - Up to 20% lower cost vs comparable x86 instances + - Up to 40% better price/performance for many workloads + - Lower power consumption + +Migration: + - t4g (general purpose, burstable) + - m6g (balanced) + - c6g (compute optimized) + - r6g (memory optimized) + +Compatibility: + - Most Linux distributions + - Container workloads (Docker, ECS, EKS) + - Not for: Windows, x86-only software +``` + +## Lambda Cost Optimization + +### Power Tuning +```typescript +// Use AWS Lambda Power Tuning tool +// https://github.com/alexcasalboni/aws-lambda-power-tuning + +interface 
PowerTuningResult { + optimalMemory: number; + currentCost: number; + optimalCost: number; + savings: number; +} + +// Example: Image processing function +const result: PowerTuningResult = { + optimalMemory: 2048, // MB + currentCost: 0.0000133, // per invocation at 1024MB + optimalCost: 0.0000119, // per invocation at 2048MB + savings: 10.5, // % (faster execution despite higher memory cost) +}; +``` + +### Lambda Cost Optimization Checklist +```yaml +Memory Optimization: + - ✅ Run power tuning for all production functions + - ✅ Monitor cold start vs warm execution cost + - ✅ Consider provisioned concurrency for latency-sensitive APIs + +Architecture: + - ✅ Avoid VPC Lambda unless necessary (saves NAT costs) + - ✅ Use Lambda Layers for shared dependencies + - ✅ Enable Lambda SnapStart for Java functions (faster cold starts) + +Invocation: + - ✅ Batch process vs streaming (fewer invocations) + - ✅ Async invocation where possible + - ✅ Use Step Functions for orchestration (not nested Lambdas) +``` + +## S3 Cost Optimization + +### Intelligent-Tiering +```yaml +Automatic Cost Optimization: + - Frequent Access tier (default) + - Infrequent Access tier (30 days no access) + - Archive Instant Access (90 days) + - Archive Access (90-730 days, optional) + - Deep Archive Access (180-730 days, optional) + +Monitoring fee: $0.0025 per 1000 objects +Cost: Worth it for > 128KB objects with unpredictable access + +Best for: + - Unknown access patterns + - Data lakes + - Long-term storage with occasional access +``` + +### Lifecycle Policy Example +```json +{ + "Rules": [ + { + "Id": "Optimize application logs", + "Status": "Enabled", + "Filter": { "Prefix": "logs/app/" }, + "Transitions": [ + { "Days": 30, "StorageClass": "STANDARD_IA" }, + { "Days": 90, "StorageClass": "GLACIER_IR" }, + { "Days": 365, "StorageClass": "DEEP_ARCHIVE" } + ], + "Expiration": { "Days": 2555 } + }, + { + "Id": "Delete incomplete multipart uploads", + "Status": "Enabled", + 
"AbortIncompleteMultipartUpload": { + "DaysAfterInitiation": 7 + } + } + ] +} +``` + +## RDS Cost Optimization + +### Reserved Instance vs Aurora Serverless +```typescript +interface DBCostComparison { + option: string; + monthlyCost: number; + usagePattern: string; + pros: string[]; + cons: string[]; +} + +const comparison: DBCostComparison[] = [ + { + option: 'On-Demand (db.t3.medium)', + monthlyCost: 50, + usagePattern: 'Variable, testing', + pros: ['No commitment', 'Easy to change'], + cons: ['Highest cost'], + }, + { + option: 'Reserved Instance 1yr (db.t3.medium)', + monthlyCost: 32, + usagePattern: 'Steady-state, 24/7', + pros: ['36% savings', 'Predictable cost'], + cons: ['1-year commitment', 'Capacity reserved'], + }, + { + option: 'Aurora Serverless v2', + monthlyCost: 15, + usagePattern: 'Intermittent, dev/test', + pros: ['Auto-scaling', 'Pay per ACU-second', '70% savings for low usage'], + cons: ['Cold start latency', 'Not for steady 24/7'], + }, +]; +``` + +### RDS Storage Optimization +```yaml +Storage Types: + gp2 (General Purpose SSD): + - $0.115/GB/month + - 3 IOPS per GB (min 100, max 16,000) + - Burstable to 3,000 IOPS + + gp3 (Newer General Purpose SSD): + - $0.08/GB/month (30% cheaper!) 
+ - 3,000 IOPS baseline (free) + - 125 MB/s throughput (free) + - Additional IOPS: $0.005 per IOPS/month + - Additional throughput: $0.04 per MB/s/month + + io1/io2 (Provisioned IOPS): + - $0.125/GB + $0.065 per IOPS + - For high-performance databases + +Migration: gp2 → gp3 saves 30% with no performance impact +``` + +## DynamoDB Cost Optimization + +### On-Demand vs Provisioned +```typescript +// Decision matrix +function chooseBillingMode(usage: UsagePattern): string { + const { requestsPerDay, peakTPS, averageTPS, predictability } = usage; + + // On-demand if: + // - Unpredictable traffic + // - Spiky workloads + // - New applications + // - < 20% peak utilization + + if (predictability < 0.5 || (peakTPS / averageTPS) > 2) { + return 'On-Demand'; + } + + // Provisioned if: + // - Predictable traffic + // - Steady-state workloads + // - High utilization (> 20%) + + if (predictability > 0.7 && (peakTPS / averageTPS) < 2) { + return 'Provisioned (with auto-scaling)'; + } + + return 'On-Demand (then migrate to Provisioned after 3 months)'; +} +``` + +### Reserved Capacity +```yaml +Savings: 53-76% discount +Commitment: 1 year +Minimum: 100 WCU or RCU + +Cost Comparison (100 WCU): + - On-Demand: $1.25 per 1M writes * 262.8M writes = $328.50/month (100 writes/sec * 3,600 sec * 730 hours) + - Provisioned: 100 WCU * $0.00065/hour * 730 = $47.45/month + - Reserved: $47.45 * 0.47 = $22.30/month + +Best for: Predictable write-heavy workloads +``` + +## Cost Anomaly Detection + +### Setup +```bash +# Create anomaly monitor +aws ce create-anomaly-monitor \ + --anomaly-monitor Name=ProductionMonitor,MonitorType=DIMENSIONAL,MonitorDimension=SERVICE + +# Create anomaly subscription +aws ce create-anomaly-subscription \ + --anomaly-subscription Name=ProductionAlerts,MonitorArnList=arn:aws:ce::123456789012:anomalymonitor/abc123,Subscribers=[{Address=team@example.com,Type=EMAIL}],Threshold=100 +``` + +### Anomaly Patterns +```yaml +Common Anomalies: + - Unexpected EC2 instance launches (compromised credentials) + - Data
transfer spikes (DDoS, misconfigured app) + - Lambda invocation explosion (infinite loops) + - S3 GET request flood (hotlinked content) + - RDS storage growth (missing retention policies) + +Alert Thresholds: + - Service cost: > 50% increase from baseline + - Daily spend: > 20% above 7-day average + - Total cost: > 10% above monthly forecast +``` + +## Tagging Strategy for Cost Allocation + +### Tag Policy +```yaml +Required Tags (enforced via AWS Config): + Environment: [prod, staging, dev, test] + Team: [platform, api, frontend, data] + Project: [alpha, beta, gamma] + CostCenter: [engineering, product, sales] + Owner: [email@example.com] + +Auto-Tagging: + - Use AWS Organizations tag policies + - Terraform: default_tags in provider + - CloudFormation: Tags parameter + - Lambda: Environment variables → tags +``` + +### Cost Allocation Tags +```bash +# Activate cost allocation tags +aws ce update-cost-allocation-tags-status \ + --cost-allocation-tags-status TagKey=Environment,Status=Active TagKey=Team,Status=Active + +# View costs by tag +aws ce get-cost-and-usage \ + --time-period Start=2025-01-01,End=2025-02-01 \ + --granularity MONTHLY \ + --metrics BlendedCost \ + --group-by Type=TAG,Key=Environment +``` + +## AWS-Specific Best Practices + +### Multi-Account Strategy +```yaml +Organization Structure: + - Management account (billing only) + - Production account (prod workloads) + - Staging account (pre-prod) + - Development account (dev/test) + - Shared Services account (logging, monitoring) + +Benefits: + - Consolidated billing (volume discounts) + - Reserved Instance sharing across accounts + - Savings Plans apply organization-wide + - Isolated blast radius + - Clear cost attribution +``` + +### AWS Free Tier Monitoring +```bash +# Set up budget for free tier limits +aws budgets create-budget \ + --account-id 123456789012 \ + --budget file://free-tier-budget.json \ + --notifications-with-subscribers file://free-tier-alerts.json +``` + +Optimize AWS costs like 
a cloud financial engineer! diff --git a/skills/cloud-pricing/SKILL.md b/skills/cloud-pricing/SKILL.md new file mode 100644 index 0000000..0b522c3 --- /dev/null +++ b/skills/cloud-pricing/SKILL.md @@ -0,0 +1,325 @@ +--- +name: cloud-pricing +description: Expert knowledge of cloud provider pricing models for AWS, Azure, GCP covering compute, storage, database, networking, and serverless services. Includes 2025 pricing data, regional differences, free tiers, pricing calculators, and cost comparison across providers. Activates for cloud pricing, how much does it cost, price comparison, AWS vs Azure vs GCP pricing, pricing calculator, estimate costs, regional pricing, free tier, what's cheaper. +--- + +# Cloud Pricing Expert + +Expert knowledge of cloud provider pricing models across AWS, Azure, and GCP with current 2025 pricing data. + +## Compute Pricing (2025) + +### AWS EC2 (us-east-1) +```yaml +t3.micro: $0.0104/hour (2 vCPU, 1GB) +t3.small: $0.0208/hour (2 vCPU, 2GB) +t3.medium: $0.0416/hour (2 vCPU, 4GB) +t3.large: $0.0832/hour (2 vCPU, 8GB) +m5.large: $0.096/hour (2 vCPU, 8GB) +c5.large: $0.085/hour (2 vCPU, 4GB) - compute-optimized +r5.large: $0.126/hour (2 vCPU, 16GB) - memory-optimized + +Spot pricing: 50-90% discount (variable) +Reserved (1yr): 35-40% discount +Reserved (3yr): 60-65% discount +Savings Plans: 30-70% discount (flexible) +``` + +### Azure VMs (East US) +```yaml +B1s: $0.0104/hour (1 vCPU, 1GB) - burstable +B2s: $0.0416/hour (2 vCPU, 4GB) +D2s v5: $0.096/hour (2 vCPU, 8GB) +F2s v2: $0.085/hour (2 vCPU, 4GB) - compute-optimized +E2s v5: $0.126/hour (2 vCPU, 16GB) - memory-optimized + +Spot: 50-90% discount +Reserved (1yr): 40% discount +Reserved (3yr): 62% discount +``` + +### GCP Compute Engine (us-central1) +```yaml +e2-micro: $0.0084/hour (0.25-2 vCPU, 1GB) +e2-small: $0.0168/hour (0.5-2 vCPU, 2GB) +e2-medium: $0.0335/hour (1-2 vCPU, 4GB) +n2-standard-2: $0.0971/hour (2 vCPU, 8GB) +c2-standard-4: $0.2088/hour (4 vCPU, 16GB) - compute + 
+Preemptible: 60-91% discount +Committed (1yr): 37% discount +Committed (3yr): 55% discount +``` + +## Serverless Pricing + +### AWS Lambda +```yaml +Requests: $0.20 per 1M requests +Compute: + - $0.0000166667 per GB-second + - 128MB, 1s = $0.0000021 + - 1024MB, 1s = $0.0000166667 + +Free tier: 1M requests, 400K GB-seconds/month + +Example: 10M requests, 512MB, 200ms avg + = 10M * $0.20/1M + 10M * 0.5GB * 0.2s * $0.0000166667 + = $2 + $16.67 = $18.67/month +``` + +### Azure Functions +```yaml +Consumption Plan: + - $0.20 per 1M executions + - $0.000016 per GB-second + +Premium Plan (always-on): + - EP1: $0.2065/hour (1 vCPU, 3.5GB) + - EP2: $0.413/hour (2 vCPU, 7GB) + +Free tier: 1M requests, 400K GB-seconds +``` + +### GCP Cloud Functions +```yaml +Invocations: $0.40 per 1M invocations +Compute: $0.0000025 per GB-second +Networking: $0.12/GB egress + +Free tier: 2M invocations, 400K GB-seconds +``` + +## Storage Pricing + +### AWS S3 (us-east-1) +```yaml +Standard: $0.023/GB/month +Standard-IA: $0.0125/GB (46% cheaper, min 128KB, 30 days) +Glacier Instant: $0.004/GB (83% cheaper, min 128KB, 90 days) +Glacier Flexible: $0.0036/GB (84% cheaper, 90 days, 1-5min retrieval) +Deep Archive: $0.00099/GB (96% cheaper, 180 days, 12hr retrieval) + +Requests: + - PUT/COPY/POST: $0.005 per 1K requests + - GET/SELECT: $0.0004 per 1K requests + +Data Transfer: $0.09/GB out to internet (first 10TB) +``` + +### Azure Blob Storage +```yaml +Hot: $0.0184/GB (frequent access) +Cool: $0.01/GB (min 30 days) +Archive: $0.00099/GB (min 180 days, 15hr retrieval) + +Transactions: + - Write: $0.05 per 10K + - Read: $0.004 per 10K +``` + +### GCP Cloud Storage +```yaml +Standard: $0.020/GB +Nearline: $0.010/GB (min 30 days) +Coldline: $0.004/GB (min 90 days) +Archive: $0.0012/GB (min 365 days) + +Operations: $0.05 per 10K Class A (write) + $0.004 per 10K Class B (read) +``` + +## Database Pricing + +### AWS RDS PostgreSQL (db.t3.medium) +```yaml +On-demand: $0.068/hour ($49.64/month)
+Reserved 1yr: $0.043/hour (37% savings) +Reserved 3yr: $0.029/hour (57% savings) + +Storage (gp3): $0.115/GB/month +Backup: $0.095/GB/month + +Aurora Serverless: $0.12 per ACU-hour (auto-scaling) +``` + +### Azure SQL Database +```yaml +General Purpose (2 vCore): + - Provisioned: $0.5556/hour ($406/month) + - Serverless: $0.75-1.50/vCore-hour (auto-pause) + +Storage: $0.115/GB/month +``` + +### GCP Cloud SQL PostgreSQL +```yaml +db-n1-standard-1 (1 vCPU, 3.75GB): + - On-demand: $0.0413/hour ($30.15/month) + - Committed 1yr: 37% discount + +Storage (SSD): $0.17/GB/month +``` + +### DynamoDB / Cosmos DB / Firestore +```yaml +DynamoDB (us-east-1): + - On-demand: $1.25 per 1M read, $1.25 per 1M write + - Provisioned: $0.00065/hour per RCU, $0.00065/hour per WCU + - Storage: $0.25/GB + +Cosmos DB: + - Provisioned: $0.008/hour per 100 RU/s + - Serverless: $0.25 per 1M RU + +Firestore: + - Reads: $0.06 per 100K + - Writes: $0.18 per 100K + - Storage: $0.18/GB +``` + +## Networking Pricing + +### Data Transfer (AWS, per GB) +```yaml +Internet egress (us-east-1): + - First 10TB: $0.09/GB + - 10-50TB: $0.085/GB + - 50-150TB: $0.070/GB + +Cross-region: $0.02/GB +Same AZ: Free +VPC peering: $0.01/GB + +NAT Gateway: + - $0.045/hour + - $0.045/GB processed +``` + +### CDN Pricing +```yaml +CloudFront (per GB): + - First 10TB: $0.085/GB + - 10-50TB: $0.080/GB + +Azure CDN: + - First 10TB: $0.081/GB + +Cloud CDN: + - First 10TB: $0.085/GB +``` + +## Price Comparison Examples + +### Example 1: Simple Web Application +```typescript +const requirements = { + compute: '2 x t3.medium (24/7)', + storage: '100GB SSD', + database: 'PostgreSQL (db.t3.medium)', + traffic: '1TB/month egress', +}; + +const costs = { + aws: { + ec2: 2 * 0.0416 * 730 = 60.74, + ebs: 100 * 0.10 = 10, + rds: 49.64 + (20 * 0.115) = 51.94, + transfer: 1000 * 0.09 = 90, + total: 212.68, + }, + azure: { + vm: 2 * 0.0416 * 730 = 60.74, + disk: 100 * 0.048 = 4.80, + sql: 406, // Managed SQL more expensive + transfer: 
1000 * 0.087 = 87, + total: 558.54, + }, + gcp: { + compute: 2 * 0.0335 * 730 = 48.91, + disk: 100 * 0.17 = 17, + sql: 30.15 + (20 * 0.17) = 33.55, + transfer: 1000 * 0.12 = 120, + total: 219.46, + }, +}; + +// Winner: AWS ($212.68/month) +// With Reserved Instances (1yr): $140/month (34% savings) +``` + +### Example 2: Serverless API +```typescript +const requirements = { + requests: '50M/month', + avgDuration: '200ms', + avgMemory: '512MB', +}; + +const lambda = { + requests: 50 * 0.20 = 10, + compute: 50e6 * 0.5 * 0.2 * 0.0000166667 = 83.33, + total: 93.33, +}; + +const azureFunctions = { + executions: 50 * 0.20 = 10, + compute: 50e6 * 0.5 * 0.2 * 0.000016 = 80, + total: 90, +}; + +const cloudFunctions = { + invocations: 50 * 0.40 = 20, + compute: 50e6 * 0.5 * 0.2 * 0.0000025 = 12.5, + networking: 1000 * 0.12 = 120, // 1TB egress + total: 152.50, +}; + +// Winner: Azure Functions ($90/month) +``` + +## Free Tiers (Always Free, 2025) + +### AWS +```yaml +EC2: 750 hours/month t2.micro (1 year) +Lambda: 1M requests, 400K GB-seconds +S3: 5GB storage (12 months) +DynamoDB: 25GB storage, 25 read/write units +RDS: 750 hours db.t2.micro (12 months) +CloudFront: 1TB transfer (12 months) +``` + +### Azure +```yaml +App Service: 10 web apps +Functions: 1M requests/month +Blob Storage: 5GB (12 months) +Cosmos DB: 1000 RU/s, 25GB +SQL Database: 100K vCore-seconds (12 months) +``` + +### GCP +```yaml +Compute: 1 f1-micro instance (744 hours/month) +Cloud Functions: 2M invocations, 400K GB-seconds +Cloud Storage: 5GB standard +Cloud Run: 2M requests, 360K GB-seconds +Firestore: 1GB storage, 50K reads, 20K writes +``` + +## Pricing Calculators + +**AWS Pricing Calculator**: https://calculator.aws +**Azure Pricing Calculator**: https://azure.microsoft.com/pricing/calculator +**GCP Pricing Calculator**: https://cloud.google.com/products/calculator + +## Regional Pricing Differences + +**Most Expensive**: Asia Pacific (Tokyo, Sydney) +**Cheapest**: US regions (us-east-1, 
us-west-2) +**Middle**: Europe (eu-west-1, eu-central-1) + +Difference: 10-30% higher in APAC vs US East + +Make informed pricing decisions with up-to-date cost data! diff --git a/skills/cost-optimization/SKILL.md b/skills/cost-optimization/SKILL.md new file mode 100644 index 0000000..edf7428 --- /dev/null +++ b/skills/cost-optimization/SKILL.md @@ -0,0 +1,337 @@ +--- +name: cost-optimization +description: Expert cloud cost optimization strategies for AWS, Azure, GCP, and serverless platforms. Covers FinOps principles, right-sizing, reserved instances, savings plans, spot instances, storage optimization, database cost reduction, serverless cost modeling, budget management, cost allocation, chargeback models, and continuous cost optimization. Activates for cost optimization, cloud costs, reduce costs, save money, finops, cost analysis, budget overrun, expensive cloud bill, cost savings, reserved instances, spot instances, savings plans, right-sizing, cost allocation tags, chargeback, showback. +--- + +# Cloud Cost Optimization Expert + +You are an expert FinOps engineer specializing in cloud cost optimization across AWS, Azure, and GCP with deep knowledge of 2024/2025 pricing models and optimization strategies. + +## Core Expertise + +### 1. FinOps Principles + +**Foundation**: +- Visibility: Centralized cost reporting +- Optimization: Continuous improvement +- Accountability: Team ownership +- Forecasting: Predictive budgeting + +**FinOps Phases**: +1. **Inform**: Visibility, allocation, benchmarking +2. **Optimize**: Right-sizing, commitment discounts, waste reduction +3. **Operate**: Continuous automation, governance + +### 2. 
Compute Cost Optimization + +**EC2/VM/Compute Engine**: +- Right-sizing (CPU, memory, network utilization analysis) +- Reserved Instances (1-year, 3-year commitments, 30-70% savings) +- Savings Plans (compute, EC2, flexible commitments) +- Spot/Preemptible Instances (50-90% discounts for fault-tolerant workloads) +- Auto-scaling groups (scale to demand) +- Graviton/Ampere processors (20-40% price-performance improvement) + +**Container Optimization**: +- ECS/EKS/AKS/GKE: Fargate vs EC2 cost comparison +- Kubernetes: Pod autoscaling (HPA, VPA, KEDA) +- Spot nodes for batch workloads +- Right-size pod resource requests/limits + +### 3. Serverless Cost Optimization + +**AWS Lambda / Azure Functions / Cloud Functions**: +```typescript +// Memory optimization (more memory = faster CPU = potentially cheaper) +// cost per invocation = memory (GB) * duration (s) * $0.0000166667 per GB-second +const optimization = { + function: 'imageProcessor', + currentConfig: { memory: 512, duration: 5000, cost: 0.00004167 }, + optimalConfig: { memory: 1024, duration: 2200, cost: 0.00003667 }, + savings: 12.0, // % per invocation +}; + +// Optimization strategies +// - Memory tuning (128MB - 10GB) +// - Provisioned concurrency vs on-demand (predictable latency) +// - Duration optimization (faster code = cheaper) +// - Avoid VPC Lambda unless needed (NAT costs) +// - Use Lambda SnapStart (Java) or container reuse +// - Batch processing vs streaming +``` + +**API Gateway / App Gateway**: +- HTTP API vs REST API (70% cheaper) +- Caching responses (reduce backend invocations) +- Request throttling + +### 4.
Storage Cost Optimization + +**S3 / Blob Storage / Cloud Storage**: +```yaml +Lifecycle Policies: + - Standard (frequent access): $0.023/GB/month + - Infrequent Access: $0.0125/GB (46% cheaper, min 30 days) + - Glacier Instant Retrieval: $0.004/GB (83% cheaper) + - Glacier Flexible: $0.0036/GB (84% cheaper, 1-5min retrieval) + - Deep Archive: $0.00099/GB (96% cheaper, 12hr retrieval) + +Optimization: + - Auto-transition to IA after 30 days + - Archive logs to Glacier after 90 days + - Deep Archive compliance data after 1 year + - Delete old data (7-year retention) + - Intelligent-Tiering for unpredictable access +``` + +**EBS / Managed Disks / Persistent Disk**: +- gp3 vs gp2 (20% cheaper, 20% faster baseline) +- Snapshot lifecycle management (delete old AMIs) +- Resize volumes (no over-provisioning) +- Throughput optimization (gp3 customizable) + +### 5. Database Cost Optimization + +**RDS / SQL Database / Cloud SQL**: +```typescript +const optimizations = [ + { + strategy: 'Reserved Instances', + savings: '35-65%', + commitment: '1 or 3 years', + }, + { + strategy: 'Right-size instance', + savings: '30-50%', + action: 'Monitor CPU, IOPS, connections', + }, + { + strategy: 'Aurora Serverless', + savings: '90% for intermittent workloads', + useCases: ['Dev/test', 'Seasonal apps'], + }, + { + strategy: 'Read replicas', + savings: 'Offload reads, smaller primary', + useCases: ['Analytics', 'Reporting'], + }, +]; +``` + +**DynamoDB / Cosmos DB / Firestore**: +- On-demand vs provisioned (predictable traffic = provisioned) +- Reserved capacity (1-year commitment, 50% savings) +- TTL for automatic data deletion +- Sparse indexes (reduce storage) + +### 6.
Networking Cost Optimization + +**Data Transfer**: +```yaml +Costs (AWS us-east-1): + - Internet egress: $0.09/GB (first 10TB) + - Inter-region: $0.02/GB + - Same AZ: Free + - VPC peering: $0.01/GB + - NAT Gateway: $0.045/GB + $0.045/hour + +Optimization: + - Use CloudFront/CDN (caching reduces origin requests) + - Same-region architecture (avoid cross-region) + - VPC endpoints for AWS services (no NAT costs) + - Direct Connect for high-volume transfers + - Compress data before transfer +``` + +### 7. Cost Allocation & Tagging + +**Tagging Strategy**: +```yaml +required_tags: + Environment: [prod, staging, dev] + Team: [platform, api, frontend] + Project: [alpha, beta] + CostCenter: [engineering, product] + Owner: [email] + +enforcement: + - AWS Config rules (deny untagged resources) + - Terraform validation + - Monthly untagged resource report +``` + +**Chargeback Model**: +```typescript +interface Chargeback { + team: string; + month: string; + costs: { + compute: number; + storage: number; + network: number; + database: number; + }; + budget: number; + variance: number; // % + recommendations: string[]; +} + +// Show-back (informational) vs Chargeback (actual billing) +``` + +### 8. 
Savings Plans & Commitments + +**AWS Savings Plans**: +- Compute Savings Plans (most flexible, EC2 + Fargate + Lambda) +- EC2 Instance Savings Plans (specific instance family) +- SageMaker Savings Plans + +**Azure Reserved Instances**: +- VM Reserved Instances +- SQL Database reserved capacity +- Cosmos DB reserved capacity + +**GCP Committed Use Discounts**: +- Compute Engine CUDs (1-year, 3-year) +- Cloud SQL commitments + +**Decision Matrix**: +```typescript +// When to use Reserved Instances vs Savings Plans +const decision = (usage: UsagePattern) => { + if (usage.consistency > 70 && usage.predictable) { + return 'Reserved Instances'; // Max savings, no flexibility + } else if (usage.consistency > 50 && usage.variesByType) { + return 'Savings Plans'; // Good savings, flexible + } else { + return 'On-demand + Spot'; // Unpredictable workloads + } +}; +``` + +### 9. Cost Anomaly Detection + +**Alert Thresholds**: +```yaml +anomaly_detection: + - metric: daily_cost + threshold: 20% # Alert if 20% above baseline + baseline: 7-day rolling average + + - metric: service_cost + threshold: 50% # Alert if service cost spikes + baseline: Previous month + +budgets: + - name: Production + limit: 30000 + alerts: [80%, 90%, 100%] +``` + +### 10. Continuous Optimization + +**Monthly Cadence**: +```markdown +Week 1: Cost Review +- Compare to budget +- Identify anomalies +- Tag compliance check + +Week 2: Optimization Planning +- Review right-sizing recommendations +- Evaluate RI/SP coverage +- Identify waste (idle resources) + +Week 3: Implementation +- Execute approved optimizations +- Purchase commitments +- Clean up waste + +Week 4: Validation +- Measure savings +- Update forecasts +- Report to stakeholders +``` + +## Best Practices + +### Quick Wins (Immediate Savings) + +1. **Terminate Idle Resources**: 5-15% savings + - Stopped instances older than 7 days + - Unattached EBS volumes + - Unused Load Balancers + - Old snapshots/AMIs + +2. 
**Right-size Over-provisioned**: 15-30% savings + - Instances with < 20% CPU utilization + - Over-provisioned memory + - Excessive IOPS + +3. **Storage Lifecycle**: 20-50% savings + - S3/Blob lifecycle policies + - Delete old logs/backups + - Compress data + +4. **Reserved Instance Coverage**: 30-70% savings + - Purchase for steady-state workloads + - Start with 1-year commitments + - Analyze 3-month usage trends + +### Architecture Patterns for Cost + +**Serverless-First**: +- No idle costs (pay per use) +- Auto-scaling included +- Best for: APIs, ETL, event processing + +**Spot/Preemptible for Batch**: +- 50-90% discounts +- Best for: CI/CD, data processing, ML training + +**Multi-tier Storage**: +- Hot (frequently accessed) → Standard +- Warm (occasional) → IA/Cool +- Cold (archive) → Glacier/Archive + +### Common Mistakes + +❌ **Don't**: +- Over-provision "just in case" +- Ignore tagging discipline +- Purchase 3-year RIs without analysis +- Run production 24/7 without auto-scaling +- Store all data in highest-cost tier + +✅ **Do**: +- Monitor and right-size continuously +- Tag everything for cost allocation +- Start with 1-year commitments +- Use auto-scaling + schedule-based scaling +- Implement storage lifecycle policies + +## Tools & Resources + +**AWS**: +- Cost Explorer (historical analysis) +- Compute Optimizer (right-sizing) +- Trusted Advisor (best practices) +- Cost Anomaly Detection + +**Azure**: +- Cost Management + Billing +- Azure Advisor (recommendations) +- Azure Pricing Calculator + +**GCP**: +- Cloud Billing Reports +- Recommender (optimization suggestions) +- Active Assist + +**Third-party**: +- CloudHealth, CloudCheckr (multi-cloud) +- Spot.io (spot instance management) +- Vantage, CloudZero (cost visibility) + +**Calculate ROI**: Savings vs engineer time spent optimizing + +You are ready to optimize cloud costs like a FinOps expert!