From 0364d81c91fb07d84e31d3c5b24a8b3de27f6315 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 09:08:44 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + plugin.lock.json | 52 ++ skills/aws-cost-operations/SKILL.md | 299 +++++++++ .../references/cloudwatch-alarms.md | 567 ++++++++++++++++++ .../references/operations-patterns.md | 394 ++++++++++++ 6 files changed, 1327 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/aws-cost-operations/SKILL.md create mode 100644 skills/aws-cost-operations/references/cloudwatch-alarms.md create mode 100644 skills/aws-cost-operations/references/operations-patterns.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..891fbfc --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "aws-cost-ops", + "description": "AWS cost optimization, monitoring, and operational excellence with integrated MCP servers for billing, cost analysis, observability, and security assessment", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Kane Zhu", + "email": "me@kane.mx" + }, + "skills": [ + "./skills/aws-cost-operations" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2939cf8 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# aws-cost-ops + +AWS cost optimization, monitoring, and operational excellence with integrated MCP servers for billing, cost analysis, observability, and security assessment diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..a3b3212 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,52 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:zxkane/aws-skills:aws-cost-ops", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": 
"887350003774c36f1d1919832e6c04a03386a5d9", + "treeHash": "a607490194e1e935e41ec4921f3eff59f68228cd962a078b40d13b223ba4ea72", + "generatedAt": "2025-11-28T10:29:13.942117Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "aws-cost-ops", + "description": "AWS cost optimization, monitoring, and operational excellence with integrated MCP servers for billing, cost analysis, observability, and security assessment" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "38595cff0ef565da377389a3dfcee6ca01e4a536bc06801ddc8d27f879d422aa" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "6505b55daf1c2bd414b370fcfb13f17e4715a0c7fa154506a636ac67957a688e" + }, + { + "path": "skills/aws-cost-operations/SKILL.md", + "sha256": "4bf11ee2db54ecc9372bcb57d54d3e22bfabe7c39907362dc22fc8deabf38852" + }, + { + "path": "skills/aws-cost-operations/references/operations-patterns.md", + "sha256": "9a9e081830695224db93d7b0cab8462d479d33c50c18e3740ef1848f1e051486" + }, + { + "path": "skills/aws-cost-operations/references/cloudwatch-alarms.md", + "sha256": "046a77894379a0b0585109fbcc397509702ff270700813ad46234896dbe2d894" + } + ], + "dirSha256": "a607490194e1e935e41ec4921f3eff59f68228cd962a078b40d13b223ba4ea72" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/aws-cost-operations/SKILL.md b/skills/aws-cost-operations/SKILL.md new file mode 100644 index 0000000..1de923b --- /dev/null +++ b/skills/aws-cost-operations/SKILL.md @@ -0,0 +1,299 @@ +--- +name: aws-cost-operations +description: This skill provides AWS cost optimization, monitoring, and operational best practices with integrated MCP servers for billing analysis, cost 
estimation, observability, and security assessment. +--- + +# AWS Cost & Operations + +This skill provides comprehensive guidance for AWS cost optimization, monitoring, observability, and operational excellence with integrated MCP servers. + +## Integrated MCP Servers + +This skill includes 8 MCP servers automatically configured with the plugin: + +### Cost Management Servers + +#### 1. AWS Billing and Cost Management MCP Server +**Purpose**: Real-time billing and cost management +- View current AWS spending and trends +- Analyze billing details across services +- Track budget utilization +- Monitor cost allocation tags +- Review consolidated billing for organizations + +#### 2. AWS Pricing MCP Server +**Purpose**: Pre-deployment cost estimation and optimization +- Estimate costs before deploying resources +- Compare pricing across regions +- Calculate Total Cost of Ownership (TCO) +- Evaluate different service options for cost efficiency +- Get current pricing information for AWS services + +#### 3. AWS Cost Explorer MCP Server +**Purpose**: Detailed cost analysis and reporting +- Analyze historical spending patterns +- Create custom cost reports +- Identify cost anomalies and trends +- Forecast future costs +- Analyze cost by service, region, or tag +- Generate cost optimization recommendations + +### Monitoring & Observability Servers + +#### 4. Amazon CloudWatch MCP Server +**Purpose**: Metrics, alarms, and logs analysis +- Query CloudWatch metrics and logs +- Create and manage CloudWatch alarms +- Analyze application performance metrics +- Troubleshoot operational issues +- Set up custom dashboards +- Monitor resource utilization + +#### 5. Amazon CloudWatch Application Signals MCP Server +**Purpose**: Application monitoring and performance insights +- Monitor application health and performance +- Analyze service-level objectives (SLOs) +- Track application dependencies +- Identify performance bottlenecks +- Monitor service map and traces + +#### 6. 
AWS Managed Prometheus MCP Server +**Purpose**: Prometheus-compatible monitoring +- Query Prometheus metrics +- Monitor containerized applications +- Analyze Kubernetes workload metrics +- Create PromQL queries +- Track custom application metrics + +### Audit & Security Servers + +#### 7. AWS CloudTrail MCP Server +**Purpose**: AWS API activity and audit analysis +- Analyze AWS API calls and user activity +- Track resource changes and modifications +- Investigate security incidents +- Audit compliance requirements +- Identify unusual access patterns +- Review who made what changes when + +#### 8. AWS Well-Architected Security Assessment Tool MCP Server +**Purpose**: Security assessment against Well-Architected Framework +- Assess security posture against AWS best practices +- Identify security gaps and vulnerabilities +- Get security improvement recommendations +- Review security pillar compliance +- Generate security assessment reports + +## When to Use This Skill + +Use this skill when: +- Optimizing AWS costs and reducing spending +- Estimating costs before deployment +- Monitoring application and infrastructure performance +- Setting up observability and alerting +- Analyzing spending patterns and trends +- Investigating operational issues +- Auditing AWS activity and changes +- Assessing security posture +- Implementing operational excellence + +## Cost Optimization Best Practices + +### Pre-Deployment Cost Estimation + +**Always estimate costs before deploying**: +1. Use **AWS Pricing MCP** to estimate resource costs +2. Compare pricing across different regions +3. Evaluate alternative service options +4. Calculate expected monthly costs +5. Plan for scaling and growth + +**Example workflow**: +``` +"Estimate the monthly cost of running a Lambda function with +1 million invocations, 512MB memory, 3-second duration in us-east-1" +``` + +### Cost Analysis and Optimization + +**Regular cost reviews**: +1. Use **Cost Explorer MCP** to analyze spending trends +2. 
Identify cost anomalies and unexpected charges +3. Review costs by service, region, and environment +4. Compare actual vs. budgeted costs +5. Generate cost optimization recommendations + +**Cost optimization strategies**: +- Right-size over-provisioned resources +- Use appropriate storage classes (S3, EBS) +- Implement auto-scaling for dynamic workloads +- Leverage Savings Plans and Reserved Instances +- Delete unused resources and snapshots +- Use cost allocation tags effectively + +### Budget Monitoring + +**Track spending against budgets**: +1. Use **Billing and Cost Management MCP** to monitor budgets +2. Set up budget alerts for threshold breaches +3. Review budget utilization regularly +4. Adjust budgets based on trends +5. Implement cost controls and governance + +## Monitoring and Observability Best Practices + +### CloudWatch Metrics and Alarms + +**Implement comprehensive monitoring**: +1. Use **CloudWatch MCP** to query metrics and logs +2. Set up alarms for critical metrics: + - CPU and memory utilization + - Error rates and latency + - Queue depths and processing times + - API gateway throttling + - Lambda errors and timeouts +3. Create CloudWatch dashboards for visualization +4. Use log insights for troubleshooting + +**Example alarm scenarios**: +- Lambda error rate > 1% +- EC2 CPU utilization > 80% +- API Gateway 4xx/5xx error spike +- DynamoDB throttled requests +- ECS task failures + +### Application Performance Monitoring + +**Monitor application health**: +1. Use **CloudWatch Application Signals MCP** for APM +2. Track service-level objectives (SLOs) +3. Monitor application dependencies +4. Identify performance bottlenecks +5. Set up distributed tracing + +### Container and Kubernetes Monitoring + +**For containerized workloads**: +1. Use **AWS Managed Prometheus MCP** for metrics +2. Monitor container resource utilization +3. Track pod and node health +4. Create PromQL queries for custom metrics +5. 
Set up alerts for container anomalies + +## Audit and Security Best Practices + +### CloudTrail Activity Analysis + +**Audit AWS activity**: +1. Use **CloudTrail MCP** to analyze API activity +2. Track who made changes to resources +3. Investigate security incidents +4. Monitor for suspicious activity patterns +5. Audit compliance with policies + +**Common audit scenarios**: +- "Who deleted this S3 bucket?" +- "Show all IAM role changes in the last 24 hours" +- "List failed login attempts" +- "Find all actions by a specific user" +- "Track modifications to security groups" + +### Security Assessment + +**Regular security reviews**: +1. Use **Well-Architected Security Assessment MCP** +2. Assess security posture against best practices +3. Identify security gaps and vulnerabilities +4. Implement recommended security improvements +5. Document security compliance + +**Security assessment areas**: +- Identity and Access Management (IAM) +- Detective controls and monitoring +- Infrastructure protection +- Data protection and encryption +- Incident response preparedness + +## Using MCP Servers Effectively + +### Cost Analysis Workflow + +1. **Pre-deployment**: Use Pricing MCP to estimate costs +2. **Post-deployment**: Use Billing MCP to track actual spending +3. **Analysis**: Use Cost Explorer MCP for detailed cost analysis +4. **Optimization**: Implement recommendations from Cost Explorer + +### Monitoring Workflow + +1. **Setup**: Configure CloudWatch metrics and alarms +2. **Monitor**: Use CloudWatch MCP to track key metrics +3. **Analyze**: Use Application Signals for APM insights +4. **Troubleshoot**: Query CloudWatch Logs for issue resolution + +### Security Workflow + +1. **Audit**: Use CloudTrail MCP to review activity +2. **Assess**: Use Well-Architected Security Assessment +3. **Remediate**: Implement security recommendations +4. **Monitor**: Track security events via CloudWatch + +### MCP Usage Best Practices + +1. 
**Cost Awareness**: Check pricing before deploying resources +2. **Proactive Monitoring**: Set up alarms for critical metrics +3. **Regular Reviews**: Analyze costs and performance weekly +4. **Audit Trails**: Review CloudTrail logs for compliance +5. **Security First**: Run security assessments regularly +6. **Optimize Continuously**: Act on cost and performance recommendations + +## Operational Excellence Guidelines + +### Cost Optimization + +- **Tag Everything**: Use consistent cost allocation tags +- **Review Monthly**: Analyze spending trends and anomalies +- **Right-size**: Match resources to actual usage +- **Automate**: Use auto-scaling and scheduling +- **Monitor Budgets**: Set alerts for cost overruns + +### Monitoring and Alerting + +- **Critical Metrics**: Alert on business-critical metrics +- **Noise Reduction**: Fine-tune thresholds to reduce false positives +- **Actionable Alerts**: Ensure alerts have clear remediation steps +- **Dashboard Visibility**: Create dashboards for key stakeholders +- **Log Retention**: Balance cost and compliance needs + +### Security and Compliance + +- **Least Privilege**: Grant minimum required permissions +- **Audit Regularly**: Review CloudTrail logs for anomalies +- **Encrypt Data**: Use encryption at rest and in transit +- **Assess Continuously**: Run security assessments frequently +- **Incident Response**: Have procedures for security events + +## Additional Resources + +For detailed operational patterns and best practices, refer to the comprehensive reference: + +**File**: `references/operations-patterns.md` + +This reference includes: +- Cost optimization strategies +- Monitoring and alerting patterns +- Observability best practices +- Security and compliance guidelines +- Troubleshooting workflows + +## CloudWatch Alarms Reference + +**File**: `references/cloudwatch-alarms.md` + +Common alarm configurations for: +- Lambda functions +- EC2 instances +- RDS databases +- DynamoDB tables +- API Gateway +- ECS 
services +- Application Load Balancers diff --git a/skills/aws-cost-operations/references/cloudwatch-alarms.md b/skills/aws-cost-operations/references/cloudwatch-alarms.md new file mode 100644 index 0000000..2bd53e0 --- /dev/null +++ b/skills/aws-cost-operations/references/cloudwatch-alarms.md @@ -0,0 +1,567 @@ +# CloudWatch Alarms Reference + +Common CloudWatch alarm configurations for AWS services. + +## Lambda Functions + +### Error Rate Alarm +```typescript +new cloudwatch.Alarm(this, 'LambdaErrorAlarm', { + metric: lambdaFunction.metricErrors({ + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 10, + evaluationPeriods: 1, + treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING, + alarmDescription: 'Lambda error count exceeded threshold', +}); +``` + +### Duration Alarm (Approaching Timeout) +```typescript +new cloudwatch.Alarm(this, 'LambdaDurationAlarm', { + metric: lambdaFunction.metricDuration({ + statistic: 'Maximum', + period: Duration.minutes(5), + }), + threshold: lambdaFunction.timeout.toMilliseconds() * 0.8, // 80% of timeout + evaluationPeriods: 2, + alarmDescription: 'Lambda duration approaching timeout', +}); +``` + +### Throttle Alarm +```typescript +new cloudwatch.Alarm(this, 'LambdaThrottleAlarm', { + metric: lambdaFunction.metricThrottles({ + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 5, + evaluationPeriods: 1, + alarmDescription: 'Lambda function is being throttled', +}); +``` + +### Concurrent Executions Alarm +```typescript +new cloudwatch.Alarm(this, 'LambdaConcurrencyAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/Lambda', + metricName: 'ConcurrentExecutions', + dimensionsMap: { + FunctionName: lambdaFunction.functionName, + }, + statistic: 'Maximum', + period: Duration.minutes(1), + }), + threshold: 100, // Adjust based on reserved concurrency + evaluationPeriods: 2, + alarmDescription: 'Lambda concurrent executions high', +}); +``` + +## API Gateway + +### 5XX Error Rate 
Alarm +```typescript +new cloudwatch.Alarm(this, 'Api5xxAlarm', { + metric: api.metricServerError({ + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 10, + evaluationPeriods: 1, + alarmDescription: 'API Gateway 5XX errors exceeded threshold', +}); +``` + +### 4XX Error Rate Alarm +```typescript +new cloudwatch.Alarm(this, 'Api4xxAlarm', { + metric: api.metricClientError({ + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 50, + evaluationPeriods: 2, + alarmDescription: 'API Gateway 4XX errors exceeded threshold', +}); +``` + +### Latency Alarm +```typescript +new cloudwatch.Alarm(this, 'ApiLatencyAlarm', { + metric: api.metricLatency({ + statistic: 'p99', + period: Duration.minutes(5), + }), + threshold: 2000, // 2 seconds + evaluationPeriods: 2, + alarmDescription: 'API Gateway p99 latency exceeded threshold', +}); +``` + +## DynamoDB + +### Read Throttle Alarm +```typescript +new cloudwatch.Alarm(this, 'DynamoDBReadThrottleAlarm', { + metric: table.metricUserErrors({ + dimensions: { + Operation: 'GetItem', + }, + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 5, + evaluationPeriods: 1, + alarmDescription: 'DynamoDB read operations being throttled', +}); +``` + +### Write Throttle Alarm +```typescript +new cloudwatch.Alarm(this, 'DynamoDBWriteThrottleAlarm', { + metric: table.metricUserErrors({ + dimensions: { + Operation: 'PutItem', + }, + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 5, + evaluationPeriods: 1, + alarmDescription: 'DynamoDB write operations being throttled', +}); +``` + +### Consumed Capacity Alarm +```typescript +new cloudwatch.Alarm(this, 'DynamoDBCapacityAlarm', { + metric: table.metricConsumedReadCapacityUnits({ + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: provisionedCapacity * 0.8, // 80% of provisioned + evaluationPeriods: 2, + alarmDescription: 'DynamoDB consumed capacity approaching limit', +}); +``` + +## EC2 Instances + +### CPU 
Utilization Alarm +```typescript +new cloudwatch.Alarm(this, 'EC2CpuAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/EC2', + metricName: 'CPUUtilization', + dimensionsMap: { + InstanceId: instance.instanceId, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 80, + evaluationPeriods: 3, + alarmDescription: 'EC2 CPU utilization high', +}); +``` + +### Status Check Failed Alarm +```typescript +new cloudwatch.Alarm(this, 'EC2StatusCheckAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/EC2', + metricName: 'StatusCheckFailed', + dimensionsMap: { + InstanceId: instance.instanceId, + }, + statistic: 'Maximum', + period: Duration.minutes(1), + }), + threshold: 1, + evaluationPeriods: 2, + alarmDescription: 'EC2 status check failed', +}); +``` + +### Disk Space Alarm (Requires CloudWatch Agent) +```typescript +new cloudwatch.Alarm(this, 'EC2DiskAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'CWAgent', + metricName: 'disk_used_percent', + dimensionsMap: { + InstanceId: instance.instanceId, + path: '/', + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 85, + evaluationPeriods: 2, + alarmDescription: 'EC2 disk space usage high', +}); +``` + +## RDS Databases + +### CPU Alarm +```typescript +new cloudwatch.Alarm(this, 'RDSCpuAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/RDS', + metricName: 'CPUUtilization', + dimensionsMap: { + DBInstanceIdentifier: dbInstance.instanceIdentifier, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 80, + evaluationPeriods: 3, + alarmDescription: 'RDS CPU utilization high', +}); +``` + +### Connection Count Alarm +```typescript +new cloudwatch.Alarm(this, 'RDSConnectionAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/RDS', + metricName: 'DatabaseConnections', + dimensionsMap: { + DBInstanceIdentifier: dbInstance.instanceIdentifier, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + 
threshold: maxConnections * 0.8, // 80% of max connections + evaluationPeriods: 2, + alarmDescription: 'RDS connection count approaching limit', +}); +``` + +### Free Storage Space Alarm +```typescript +new cloudwatch.Alarm(this, 'RDSStorageAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/RDS', + metricName: 'FreeStorageSpace', + dimensionsMap: { + DBInstanceIdentifier: dbInstance.instanceIdentifier, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 10 * 1024 * 1024 * 1024, // 10 GB in bytes + comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD, + evaluationPeriods: 1, + alarmDescription: 'RDS free storage space low', +}); +``` + +## ECS Services + +### Task Count Alarm +```typescript +new cloudwatch.Alarm(this, 'ECSTaskCountAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'ECS/ContainerInsights', + metricName: 'RunningTaskCount', + dimensionsMap: { + ServiceName: service.serviceName, + ClusterName: cluster.clusterName, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 1, + comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD, + evaluationPeriods: 2, + alarmDescription: 'ECS service has no running tasks', +}); +``` + +### CPU Utilization Alarm +```typescript +new cloudwatch.Alarm(this, 'ECSCpuAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/ECS', + metricName: 'CPUUtilization', + dimensionsMap: { + ServiceName: service.serviceName, + ClusterName: cluster.clusterName, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 80, + evaluationPeriods: 3, + alarmDescription: 'ECS service CPU utilization high', +}); +``` + +### Memory Utilization Alarm +```typescript +new cloudwatch.Alarm(this, 'ECSMemoryAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/ECS', + metricName: 'MemoryUtilization', + dimensionsMap: { + ServiceName: service.serviceName, + ClusterName: cluster.clusterName, + }, + statistic: 'Average', + 
period: Duration.minutes(5), + }), + threshold: 85, + evaluationPeriods: 2, + alarmDescription: 'ECS service memory utilization high', +}); +``` + +## SQS Queues + +### Queue Depth Alarm +```typescript +new cloudwatch.Alarm(this, 'SQSDepthAlarm', { + metric: queue.metricApproximateNumberOfMessagesVisible({ + statistic: 'Maximum', + period: Duration.minutes(5), + }), + threshold: 1000, + evaluationPeriods: 2, + alarmDescription: 'SQS queue depth exceeded threshold', +}); +``` + +### Age of Oldest Message Alarm +```typescript +new cloudwatch.Alarm(this, 'SQSAgeAlarm', { + metric: queue.metricApproximateAgeOfOldestMessage({ + statistic: 'Maximum', + period: Duration.minutes(5), + }), + threshold: 300, // 5 minutes in seconds + evaluationPeriods: 1, + alarmDescription: 'SQS messages not being processed timely', +}); +``` + +## Application Load Balancer + +### Target Health Alarm +```typescript +new cloudwatch.Alarm(this, 'ALBUnhealthyTargetAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/ApplicationELB', + metricName: 'UnHealthyHostCount', + dimensionsMap: { + LoadBalancer: loadBalancer.loadBalancerFullName, + TargetGroup: targetGroup.targetGroupFullName, + }, + statistic: 'Average', + period: Duration.minutes(5), + }), + threshold: 1, + evaluationPeriods: 2, + alarmDescription: 'ALB has unhealthy targets', +}); +``` + +### HTTP 5XX Alarm +```typescript +new cloudwatch.Alarm(this, 'ALB5xxAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/ApplicationELB', + metricName: 'HTTPCode_Target_5XX_Count', + dimensionsMap: { + LoadBalancer: loadBalancer.loadBalancerFullName, + }, + statistic: 'Sum', + period: Duration.minutes(5), + }), + threshold: 10, + evaluationPeriods: 1, + alarmDescription: 'ALB target 5XX errors exceeded threshold', +}); +``` + +### Response Time Alarm +```typescript +new cloudwatch.Alarm(this, 'ALBLatencyAlarm', { + metric: new cloudwatch.Metric({ + namespace: 'AWS/ApplicationELB', + metricName: 'TargetResponseTime', + 
dimensionsMap: { + LoadBalancer: loadBalancer.loadBalancerFullName, + }, + statistic: 'p99', + period: Duration.minutes(5), + }), + threshold: 1, // 1 second + evaluationPeriods: 2, + alarmDescription: 'ALB p99 response time exceeded threshold', +}); +``` + +## Composite Alarms + +### Service Health Composite Alarm +```typescript +const errorAlarm = new cloudwatch.Alarm(this, 'ErrorAlarm', { /* ... */ }); +const latencyAlarm = new cloudwatch.Alarm(this, 'LatencyAlarm', { /* ... */ }); +const throttleAlarm = new cloudwatch.Alarm(this, 'ThrottleAlarm', { /* ... */ }); + +new cloudwatch.CompositeAlarm(this, 'ServiceHealthAlarm', { + compositeAlarmName: 'service-health', + alarmRule: cloudwatch.AlarmRule.anyOf( + errorAlarm, + latencyAlarm, + throttleAlarm + ), + alarmDescription: 'Overall service health degraded', +}); +``` + +## Alarm Actions + +### SNS Topic Integration +```typescript +const topic = new sns.Topic(this, 'AlarmTopic', { + displayName: 'CloudWatch Alarms', +}); + +// Email subscription +topic.addSubscription(new subscriptions.EmailSubscription('ops@example.com')); + +// Add action to alarm +alarm.addAlarmAction(new actions.SnsAction(topic)); +alarm.addOkAction(new actions.SnsAction(topic)); +``` + +### Auto Scaling Action +```typescript +const scalingAction = targetGroup.scaleOnMetric('ScaleUp', { + metric: targetGroup.metricTargetResponseTime(), + scalingSteps: [ + { upper: 1, change: 0 }, + { lower: 1, change: +1 }, + { lower: 2, change: +2 }, + ], +}); +``` + +## Alarm Best Practices + +### Threshold Selection + +**CPU/Memory Alarms**: +- Warning: 70-80% +- Critical: 80-90% +- Consider burst patterns and normal usage + +**Error Rate Alarms**: +- Threshold based on SLA (e.g., 99.9% = 0.1% error rate) +- Account for normal error rates +- Different thresholds for different error types + +**Latency Alarms**: +- p99 latency for user-facing APIs +- Warning: 80% of SLA target +- Critical: 100% of SLA target + +### Evaluation Periods + +**Fast-changing 
metrics** (1-2 periods): +- Error counts +- Failed health checks +- Critical application errors + +**Slow-changing metrics** (3-5 periods): +- CPU utilization +- Memory usage +- Disk usage + +**Cost-related metrics** (longer periods): +- Daily spending +- Resource count changes +- Usage patterns + +### Missing Data Handling + +```typescript +// For intermittent workloads +alarm.treatMissingData(cloudwatch.TreatMissingData.NOT_BREACHING); + +// For always-on services +alarm.treatMissingData(cloudwatch.TreatMissingData.BREACHING); + +// To distinguish from data issues +alarm.treatMissingData(cloudwatch.TreatMissingData.MISSING); +``` + +### Alarm Naming Conventions + +```typescript +// Pattern: <service>-<metric>-<severity> +'lambda-errors-critical' +'api-latency-warning' +'rds-cpu-warning' +'ecs-tasks-critical' +``` + +### Alarm Actions Best Practices + +1. **Separate topics by severity**: + - Critical alarms → PagerDuty/on-call + - Warning alarms → Slack/email + - Info alarms → Metrics dashboard + +2. **Include context in alarm description**: + - Service name + - Expected threshold + - Troubleshooting runbook link + +3. **Auto-remediation where possible**: + - Lambda errors → automatic retry + - CPU high → auto-scaling trigger + - Disk full → automated cleanup + +4. 
**Alarm fatigue prevention**: + - Tune thresholds based on actual patterns + - Use composite alarms to reduce noise + - Implement proper evaluation periods + - Regularly review and adjust alarms + +## Monitoring Dashboard + +### Recommended Dashboard Layout + +**Service Overview**: +- Request count and rate +- Error count and percentage +- Latency (p50, p95, p99) +- Availability percentage + +**Resource Utilization**: +- CPU utilization by service +- Memory utilization by service +- Network throughput +- Disk I/O + +**Cost Metrics**: +- Daily spending by service +- Month-to-date costs +- Budget utilization +- Cost anomalies + +**Security Metrics**: +- Failed login attempts +- IAM policy changes +- Security group modifications +- GuardDuty findings diff --git a/skills/aws-cost-operations/references/operations-patterns.md b/skills/aws-cost-operations/references/operations-patterns.md new file mode 100644 index 0000000..0ef2f71 --- /dev/null +++ b/skills/aws-cost-operations/references/operations-patterns.md @@ -0,0 +1,394 @@ +# AWS Cost & Operations Patterns + +Comprehensive patterns and best practices for AWS cost optimization, monitoring, and operational excellence. + +## Table of Contents + +- [Cost Optimization Patterns](#cost-optimization-patterns) +- [Monitoring Patterns](#monitoring-patterns) +- [Observability Patterns](#observability-patterns) +- [Security and Audit Patterns](#security-and-audit-patterns) +- [Troubleshooting Workflows](#troubleshooting-workflows) + +## Cost Optimization Patterns + +### Pattern 1: Cost Estimation Before Deployment + +**When**: Before deploying any new infrastructure + +**MCP Server**: AWS Pricing MCP + +**Steps**: +1. List all resources to be deployed +2. Query pricing for each resource type +3. Calculate monthly costs based on expected usage +4. Compare pricing across regions +5. 
Document cost estimates in architecture docs + +**Example**: +``` +Resource: Lambda Function +- Invocations: 1,000,000/month +- Duration: 3 seconds avg +- Memory: 512 MB +- Region: us-east-1 +Estimated cost: $X/month +``` + +### Pattern 2: Monthly Cost Review + +**When**: First week of every month + +**MCP Servers**: Cost Explorer MCP, Billing and Cost Management MCP + +**Steps**: +1. Review total spending vs. budget +2. Analyze cost by service (top 5 services) +3. Identify cost anomalies (>20% increase) +4. Review cost by environment (dev/staging/prod) +5. Check cost allocation tag coverage +6. Generate cost optimization recommendations + +**Key Metrics**: +- Month-over-month cost change +- Cost per environment +- Cost per application/project +- Untagged resource costs + +### Pattern 3: Right-Sizing Resources + +**When**: Quarterly or when utilization alerts trigger + +**MCP Servers**: CloudWatch MCP, Cost Explorer MCP + +**Steps**: +1. Query CloudWatch for resource utilization metrics +2. Identify over-provisioned resources (< 40% utilization) +3. Identify under-provisioned resources (> 80% utilization) +4. Calculate potential savings from right-sizing +5. Plan and execute right-sizing changes +6. Monitor post-change performance + +**Common Right-Sizing Scenarios**: +- EC2 instances with low CPU utilization +- RDS instances with excess capacity +- DynamoDB tables with low read/write usage +- Lambda functions with excessive memory allocation + +### Pattern 4: Unused Resource Cleanup + +**When**: Monthly or triggered by cost anomalies + +**MCP Servers**: Cost Explorer MCP, CloudTrail MCP + +**Steps**: +1. Identify resources with zero usage +2. Query CloudTrail for last access time +3. Tag resources for deletion review +4. Notify resource owners +5. Delete confirmed unused resources +6. 
Track cost savings + +**Common Unused Resources**: +- Unattached EBS volumes +- Old EBS snapshots +- Idle Load Balancers +- Unused Elastic IPs +- Old AMIs and snapshots +- Stopped EC2 instances (long-term) + +## Monitoring Patterns + +### Pattern 1: Critical Service Monitoring + +**When**: All production services + +**MCP Server**: CloudWatch MCP + +**Metrics to Monitor**: +- **Availability**: Service uptime, health checks +- **Performance**: Latency, response time +- **Errors**: Error rate, failed requests +- **Saturation**: CPU, memory, disk, network utilization + +**Alarm Thresholds** (adjust based on SLAs): +- Error rate: > 1% for 2 consecutive periods +- Latency: p99 > 1 second for 5 minutes +- CPU: > 80% for 10 minutes +- Memory: > 85% for 5 minutes + +### Pattern 2: Lambda Function Monitoring + +**MCP Server**: CloudWatch MCP + +**Key Metrics**: +``` +- Invocations (Count) +- Errors (Count, %) +- Duration (Average, p99) +- Throttles (Count) +- ConcurrentExecutions (Max) +- IteratorAge (for stream processing) +``` + +**Recommended Alarms**: +- Error rate > 1% +- Duration > 80% of timeout +- Throttles > 0 +- ConcurrentExecutions > 80% of reserved + +### Pattern 3: API Gateway Monitoring + +**MCP Server**: CloudWatch MCP + +**Key Metrics**: +``` +- Count (Total requests) +- 4XXError, 5XXError +- Latency (p50, p95, p99) +- IntegrationLatency +- CacheHitCount, CacheMissCount +``` + +**Recommended Alarms**: +- 5XX error rate > 0.5% +- 4XX error rate > 5% +- Latency p99 > 2 seconds +- Integration latency spike + +### Pattern 4: Database Monitoring + +**MCP Server**: CloudWatch MCP + +**RDS Metrics**: +``` +- CPUUtilization +- DatabaseConnections +- FreeableMemory +- ReadLatency, WriteLatency +- ReadIOPS, WriteIOPS +- FreeStorageSpace +``` + +**DynamoDB Metrics**: +``` +- ConsumedReadCapacityUnits +- ConsumedWriteCapacityUnits +- UserErrors +- SystemErrors +- ThrottledRequests +``` + +**Recommended Alarms**: +- RDS CPU > 80% for 10 minutes +- RDS connections > 80% 
of max +- RDS free storage < 10 GB +- DynamoDB throttled requests > 0 +- DynamoDB user errors spike + +## Observability Patterns + +### Pattern 1: Distributed Tracing Setup + +**MCP Server**: CloudWatch Application Signals MCP + +**Components**: +1. **Service Map**: Visualize service dependencies +2. **Traces**: Track requests across services +3. **Metrics**: Monitor latency and errors per service +4. **SLOs**: Define and track service level objectives + +**Implementation**: +- Enable X-Ray tracing on Lambda functions +- Add X-Ray SDK to application code +- Configure sampling rules +- Create CloudWatch ServiceLens dashboards + +### Pattern 2: Log Aggregation and Analysis + +**MCP Server**: CloudWatch MCP + +**Log Strategy**: +1. **Centralize Logs**: Send all application logs to CloudWatch Logs +2. **Structure Logs**: Use JSON format for structured logging +3. **Logs Insights**: Use CloudWatch Logs Insights for queries +4. **Retention**: Set appropriate retention periods + +**Example Logs Insights Queries**: +``` +# Find errors in the last hour +fields @timestamp, @message +| filter @message like /ERROR/ +| sort @timestamp desc +| limit 100 + +# Count errors by type +stats count(*) as error_count by error_type +| sort error_count desc + +# Calculate p99 latency +stats pct(duration, 99) by service_name +``` + +### Pattern 3: Custom Metrics + +**MCP Server**: CloudWatch MCP + +**When to Use Custom Metrics**: +- Business-specific KPIs (orders/minute, revenue/hour) +- Application-specific metrics (cache hit rate, queue depth) +- Performance metrics not provided by AWS + +**Best Practices**: +- Use consistent namespace: `CompanyName/ApplicationName` +- Include relevant dimensions (environment, region, version) +- Publish metrics at appropriate intervals +- Use metric filters for log-derived metrics + +## Security and Audit Patterns + +### Pattern 1: API Activity Auditing + +**MCP Server**: CloudTrail MCP + +**Regular Audit Queries**: +``` +# Find all IAM changes +eventName: CreateUser, DeleteUser, 
AttachUserPolicy, etc. +Time: Last 24 hours + +# Track S3 bucket deletions +eventName: DeleteBucket +Time: Last 7 days + +# Find failed login attempts +eventName: ConsoleLogin +responseElements.ConsoleLogin: Failure + +# Monitor privileged actions +userIdentity.arn: *admin* OR *root* +``` + +**Audit Schedule**: +- Daily: Review privileged user actions +- Weekly: Audit IAM changes and security group modifications +- Monthly: Comprehensive security review + +### Pattern 2: Security Posture Assessment + +**MCP Server**: Well-Architected Security Assessment Tool MCP + +**Assessment Areas**: +1. **Identity and Access Management** + - Least privilege implementation + - MFA enforcement + - Role-based access control + - Service control policies + +2. **Detective Controls** + - CloudTrail enabled in all regions + - GuardDuty findings review + - Config rule compliance + - Security Hub findings + +3. **Infrastructure Protection** + - VPC security groups review + - Network ACLs configuration + - AWS WAF rules + - Security group ingress rules + +4. **Data Protection** + - Encryption at rest (S3, EBS, RDS) + - Encryption in transit (TLS/SSL) + - KMS key usage and rotation + - Secrets Manager utilization + +5. 
**Incident Response** + - IR playbooks documented + - Automated response procedures + - Contact information current + - Regular IR drills + +**Assessment Frequency**: +- Quarterly: Full Well-Architected review +- Monthly: High-priority findings review +- Weekly: Critical security findings + +### Pattern 3: Compliance Monitoring + +**MCP Servers**: CloudTrail MCP, CloudWatch MCP + +**Compliance Requirements**: +- Data residency (ensure data stays in approved regions) +- Access logging (all access logged and retained) +- Encryption requirements (data encrypted at rest and in transit) +- Change management (all changes tracked in CloudTrail) + +**Compliance Dashboards**: +- Encryption coverage by service +- CloudTrail logging status +- Failed login attempts +- Privileged access usage +- Non-compliant resources + +## Troubleshooting Workflows + +### Workflow 1: High Lambda Error Rate + +**MCP Servers**: CloudWatch MCP, CloudWatch Application Signals MCP + +**Steps**: +1. Query CloudWatch for Lambda error metrics +2. Check error logs in CloudWatch Logs +3. Identify error patterns (timeout, memory, permission) +4. Check Lambda configuration (memory, timeout, permissions) +5. Review recent code deployments +6. Check downstream service health +7. Implement fix and monitor + +### Workflow 2: Increased Latency + +**MCP Servers**: CloudWatch MCP, CloudWatch Application Signals MCP + +**Steps**: +1. Identify latency spike in CloudWatch metrics +2. Check service map for slow dependencies +3. Query distributed traces for slow requests +4. Check database query performance +5. Review API Gateway integration latency +6. Check Lambda cold starts +7. Identify bottleneck and optimize + +### Workflow 3: Cost Spike Investigation + +**MCP Servers**: Cost Explorer MCP, CloudWatch MCP, CloudTrail MCP + +**Steps**: +1. Use Cost Explorer to identify service causing spike +2. Check CloudWatch metrics for usage increase +3. Review CloudTrail for recent resource creation +4. 
Identify root cause (misconfiguration, runaway process, attack) +5. Implement cost controls (budgets, alarms, service quotas) +6. Clean up unnecessary resources + +### Workflow 4: Security Incident Response + +**MCP Servers**: CloudTrail MCP, GuardDuty (via CloudWatch), Well-Architected Security Assessment Tool MCP + +**Steps**: +1. Identify security event in GuardDuty or CloudWatch +2. Query CloudTrail for related API activity +3. Determine scope and impact +4. Isolate affected resources +5. Revoke compromised credentials +6. Implement remediation +7. Conduct post-incident review +8. Update security controls + +## Summary + +- **Cost Optimization**: Use Pricing, Cost Explorer, and Billing MCPs for proactive cost management +- **Monitoring**: Set up comprehensive CloudWatch alarms for all critical services +- **Observability**: Implement distributed tracing and structured logging +- **Security**: Run regular CloudTrail audits and Well-Architected assessments +- **Proactive**: Don't wait for incidents — monitor and optimize continuously