zhongwei/gh-zxkane-aws-skills-aws-cost-ops

Fork 0

Files

Zhongwei Li 0364d81c91 Initial commit

2025-11-30 09:08:44 +08:00

13 KiB

Raw Blame History

CloudWatch Alarms Reference

Common CloudWatch alarm configurations for AWS services.

Lambda Functions

Error Rate Alarm

// Error-count alarm: sums the function's error metric over 5-minute windows.
const lambdaErrorCount = lambdaFunction.metricErrors({
  period: Duration.minutes(5),
  statistic: 'Sum',
});

new cloudwatch.Alarm(this, 'LambdaErrorAlarm', {
  alarmDescription: 'Lambda error count exceeded threshold',
  metric: lambdaErrorCount,
  threshold: 10,
  evaluationPeriods: 1,
  // Windows without datapoints are not counted as breaching.
  treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

Duration Alarm (Approaching Timeout)

// `timeout` is only populated when the function was created with an explicit
// timeout; fall back to Lambda's 3-second default so the threshold can still
// be computed instead of throwing on `undefined`.
const functionTimeout = lambdaFunction.timeout ?? Duration.seconds(3);

// Duration alarm: worst-case invocation duration per 5-minute window, compared
// against 80% of the configured timeout (in milliseconds).
new cloudwatch.Alarm(this, 'LambdaDurationAlarm', {
  metric: lambdaFunction.metricDuration({
    statistic: 'Maximum',
    period: Duration.minutes(5),
  }),
  threshold: functionTimeout.toMilliseconds() * 0.8, // 80% of timeout
  evaluationPeriods: 2,
  alarmDescription: 'Lambda duration approaching timeout',
});

Throttle Alarm

// Throttle alarm: total throttles reported for the function per 5-minute window.
const lambdaThrottleCount = lambdaFunction.metricThrottles({
  period: Duration.minutes(5),
  statistic: 'Sum',
});

new cloudwatch.Alarm(this, 'LambdaThrottleAlarm', {
  alarmDescription: 'Lambda function is being throttled',
  metric: lambdaThrottleCount,
  threshold: 5,
  evaluationPeriods: 1,
});

Concurrent Executions Alarm

// Peak concurrent executions of this function, evaluated per 1-minute window.
const lambdaConcurrency = new cloudwatch.Metric({
  namespace: 'AWS/Lambda',
  metricName: 'ConcurrentExecutions',
  dimensionsMap: { FunctionName: lambdaFunction.functionName },
  period: Duration.minutes(1),
  statistic: 'Maximum',
});

new cloudwatch.Alarm(this, 'LambdaConcurrencyAlarm', {
  alarmDescription: 'Lambda concurrent executions high',
  metric: lambdaConcurrency,
  threshold: 100, // tune to the function's reserved concurrency
  evaluationPeriods: 2,
});

API Gateway

5XX Error Rate Alarm

// Server-side (5XX) error count for the API, summed per 5-minute window.
const apiServerErrors = api.metricServerError({
  period: Duration.minutes(5),
  statistic: 'Sum',
});

new cloudwatch.Alarm(this, 'Api5xxAlarm', {
  alarmDescription: 'API Gateway 5XX errors exceeded threshold',
  metric: apiServerErrors,
  threshold: 10,
  evaluationPeriods: 1,
});

4XX Error Rate Alarm

// Client-side (4XX) error count for the API, summed per 5-minute window.
// Uses a higher threshold and two evaluation periods, unlike the 5XX alarm.
const apiClientErrors = api.metricClientError({
  period: Duration.minutes(5),
  statistic: 'Sum',
});

new cloudwatch.Alarm(this, 'Api4xxAlarm', {
  alarmDescription: 'API Gateway 4XX errors exceeded threshold',
  metric: apiClientErrors,
  threshold: 50,
  evaluationPeriods: 2,
});

Latency Alarm

// p99 API latency per 5-minute window.
const apiP99Latency = api.metricLatency({
  period: Duration.minutes(5),
  statistic: 'p99',
});

new cloudwatch.Alarm(this, 'ApiLatencyAlarm', {
  alarmDescription: 'API Gateway p99 latency exceeded threshold',
  metric: apiP99Latency,
  threshold: 2000, // milliseconds, i.e. 2 seconds
  evaluationPeriods: 2,
});

DynamoDB

Read Throttle Alarm

// Read throttling is reported by the table-level ReadThrottleEvents metric.
// UserErrors is not a throttle signal: it counts HTTP 400 responses across the
// whole account/region, and table.metricUserErrors() rejects custom dimensions.
new cloudwatch.Alarm(this, 'DynamoDBReadThrottleAlarm', {
  metric: new cloudwatch.Metric({
    namespace: 'AWS/DynamoDB',
    metricName: 'ReadThrottleEvents',
    dimensionsMap: {
      TableName: table.tableName,
    },
    statistic: 'Sum',
    period: Duration.minutes(5),
  }),
  threshold: 5,
  evaluationPeriods: 1,
  // DynamoDB only publishes this metric when it is non-zero, so a window with
  // no datapoints means no throttling occurred.
  treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
  alarmDescription: 'DynamoDB read operations being throttled',
});

Write Throttle Alarm

// Write throttling is reported by the table-level WriteThrottleEvents metric.
// UserErrors is not a throttle signal: it counts HTTP 400 responses across the
// whole account/region, and table.metricUserErrors() rejects custom dimensions.
new cloudwatch.Alarm(this, 'DynamoDBWriteThrottleAlarm', {
  metric: new cloudwatch.Metric({
    namespace: 'AWS/DynamoDB',
    metricName: 'WriteThrottleEvents',
    dimensionsMap: {
      TableName: table.tableName,
    },
    statistic: 'Sum',
    period: Duration.minutes(5),
  }),
  threshold: 5,
  evaluationPeriods: 1,
  // DynamoDB only publishes this metric when it is non-zero, so a window with
  // no datapoints means no throttling occurred.
  treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
  alarmDescription: 'DynamoDB write operations being throttled',
});

Consumed Capacity Alarm

// Window over which consumed read capacity is summed.
const capacityPeriod = Duration.minutes(5);

// Consumed-capacity alarm. The Sum statistic is the total number of read
// capacity units consumed during the whole period, whereas provisioned
// capacity is expressed in units per second. The provisioned figure is
// therefore scaled by the period length before taking 80% of it.
new cloudwatch.Alarm(this, 'DynamoDBCapacityAlarm', {
  metric: table.metricConsumedReadCapacityUnits({
    statistic: 'Sum',
    period: capacityPeriod,
  }),
  threshold: provisionedCapacity * capacityPeriod.toSeconds() * 0.8, // 80% of provisioned
  evaluationPeriods: 2,
  alarmDescription: 'DynamoDB consumed capacity approaching limit',
});

EC2 Instances

CPU Utilization Alarm

// Average CPU utilization of a single instance per 5-minute window.
const instanceCpu = new cloudwatch.Metric({
  namespace: 'AWS/EC2',
  metricName: 'CPUUtilization',
  dimensionsMap: { InstanceId: instance.instanceId },
  period: Duration.minutes(5),
  statistic: 'Average',
});

// Three evaluation periods are required at a threshold of 80 (percent).
new cloudwatch.Alarm(this, 'EC2CpuAlarm', {
  alarmDescription: 'EC2 CPU utilization high',
  metric: instanceCpu,
  threshold: 80,
  evaluationPeriods: 3,
});

Status Check Failed Alarm

// Status-check result for a single instance; Maximum surfaces any failure
// reported within the 1-minute window.
const instanceStatusCheck = new cloudwatch.Metric({
  namespace: 'AWS/EC2',
  metricName: 'StatusCheckFailed',
  dimensionsMap: { InstanceId: instance.instanceId },
  period: Duration.minutes(1),
  statistic: 'Maximum',
});

new cloudwatch.Alarm(this, 'EC2StatusCheckAlarm', {
  alarmDescription: 'EC2 status check failed',
  metric: instanceStatusCheck,
  threshold: 1,
  evaluationPeriods: 2,
});

Disk Space Alarm (Requires CloudWatch Agent)

// Root-volume usage, read from the CWAgent namespace.
// NOTE(review): the agent may publish extra dimensions (e.g. device, fstype);
// confirm the dimension set below matches the agent configuration, otherwise
// the alarm will not find any datapoints.
const rootDiskUsage = new cloudwatch.Metric({
  namespace: 'CWAgent',
  metricName: 'disk_used_percent',
  dimensionsMap: {
    InstanceId: instance.instanceId,
    path: '/',
  },
  period: Duration.minutes(5),
  statistic: 'Average',
});

new cloudwatch.Alarm(this, 'EC2DiskAlarm', {
  alarmDescription: 'EC2 disk space usage high',
  metric: rootDiskUsage,
  threshold: 85,
  evaluationPeriods: 2,
});

RDS Databases

CPU Alarm

// Average CPU utilization of the database instance per 5-minute window.
const dbCpu = new cloudwatch.Metric({
  namespace: 'AWS/RDS',
  metricName: 'CPUUtilization',
  dimensionsMap: { DBInstanceIdentifier: dbInstance.instanceIdentifier },
  period: Duration.minutes(5),
  statistic: 'Average',
});

new cloudwatch.Alarm(this, 'RDSCpuAlarm', {
  alarmDescription: 'RDS CPU utilization high',
  metric: dbCpu,
  threshold: 80,
  evaluationPeriods: 3,
});

Connection Count Alarm

// Average number of open database connections per 5-minute window.
const dbConnections = new cloudwatch.Metric({
  namespace: 'AWS/RDS',
  metricName: 'DatabaseConnections',
  dimensionsMap: { DBInstanceIdentifier: dbInstance.instanceIdentifier },
  period: Duration.minutes(5),
  statistic: 'Average',
});

// Alarm threshold is 80% of the instance's connection limit.
const connectionThreshold = maxConnections * 0.8;

new cloudwatch.Alarm(this, 'RDSConnectionAlarm', {
  alarmDescription: 'RDS connection count approaching limit',
  metric: dbConnections,
  threshold: connectionThreshold,
  evaluationPeriods: 2,
});

Free Storage Space Alarm

// Average free storage on the database instance per 5-minute window.
const dbFreeStorage = new cloudwatch.Metric({
  namespace: 'AWS/RDS',
  metricName: 'FreeStorageSpace',
  dimensionsMap: { DBInstanceIdentifier: dbInstance.instanceIdentifier },
  period: Duration.minutes(5),
  statistic: 'Average',
});

// 10 GB expressed in bytes.
const minFreeStorageBytes = 10 * 1024 ** 3;

// Low-watermark alarm: fires when free space drops below the threshold,
// hence the explicit LESS_THAN comparison.
new cloudwatch.Alarm(this, 'RDSStorageAlarm', {
  alarmDescription: 'RDS free storage space low',
  metric: dbFreeStorage,
  comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
  threshold: minFreeStorageBytes,
  evaluationPeriods: 1,
});

ECS Services

Task Count Alarm

// Running task count for the service, read from the ECS/ContainerInsights namespace.
const runningTasks = new cloudwatch.Metric({
  namespace: 'ECS/ContainerInsights',
  metricName: 'RunningTaskCount',
  dimensionsMap: {
    ClusterName: cluster.clusterName,
    ServiceName: service.serviceName,
  },
  period: Duration.minutes(5),
  statistic: 'Average',
});

// Fires when the average running task count falls below 1.
new cloudwatch.Alarm(this, 'ECSTaskCountAlarm', {
  alarmDescription: 'ECS service has no running tasks',
  metric: runningTasks,
  comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
  threshold: 1,
  evaluationPeriods: 2,
});

CPU Utilization Alarm

// Average CPU utilization of the ECS service per 5-minute window.
const serviceCpu = new cloudwatch.Metric({
  namespace: 'AWS/ECS',
  metricName: 'CPUUtilization',
  dimensionsMap: {
    ClusterName: cluster.clusterName,
    ServiceName: service.serviceName,
  },
  period: Duration.minutes(5),
  statistic: 'Average',
});

new cloudwatch.Alarm(this, 'ECSCpuAlarm', {
  alarmDescription: 'ECS service CPU utilization high',
  metric: serviceCpu,
  threshold: 80,
  evaluationPeriods: 3,
});

Memory Utilization Alarm

// Average memory utilization of the ECS service per 5-minute window.
const serviceMemory = new cloudwatch.Metric({
  namespace: 'AWS/ECS',
  metricName: 'MemoryUtilization',
  dimensionsMap: {
    ClusterName: cluster.clusterName,
    ServiceName: service.serviceName,
  },
  period: Duration.minutes(5),
  statistic: 'Average',
});

new cloudwatch.Alarm(this, 'ECSMemoryAlarm', {
  alarmDescription: 'ECS service memory utilization high',
  metric: serviceMemory,
  threshold: 85,
  evaluationPeriods: 2,
});

SQS Queues

Queue Depth Alarm

// Peak number of visible (waiting) messages per 5-minute window.
const queueDepth = queue.metricApproximateNumberOfMessagesVisible({
  period: Duration.minutes(5),
  statistic: 'Maximum',
});

new cloudwatch.Alarm(this, 'SQSDepthAlarm', {
  alarmDescription: 'SQS queue depth exceeded threshold',
  metric: queueDepth,
  threshold: 1000,
  evaluationPeriods: 2,
});

Age of Oldest Message Alarm

// Peak age of the oldest message in the queue per 5-minute window.
const oldestMessageAge = queue.metricApproximateAgeOfOldestMessage({
  period: Duration.minutes(5),
  statistic: 'Maximum',
});

new cloudwatch.Alarm(this, 'SQSAgeAlarm', {
  alarmDescription: 'SQS messages not being processed timely',
  metric: oldestMessageAge,
  threshold: 300, // seconds, i.e. 5 minutes
  evaluationPeriods: 1,
});

Application Load Balancer

Target Health Alarm

// Average unhealthy host count for one target group behind the load balancer.
const unhealthyHosts = new cloudwatch.Metric({
  namespace: 'AWS/ApplicationELB',
  metricName: 'UnHealthyHostCount',
  dimensionsMap: {
    TargetGroup: targetGroup.targetGroupFullName,
    LoadBalancer: loadBalancer.loadBalancerFullName,
  },
  period: Duration.minutes(5),
  statistic: 'Average',
});

new cloudwatch.Alarm(this, 'ALBUnhealthyTargetAlarm', {
  alarmDescription: 'ALB has unhealthy targets',
  metric: unhealthyHosts,
  threshold: 1,
  evaluationPeriods: 2,
});

HTTP 5XX Alarm

// Target-generated 5XX responses behind the load balancer, summed per
// 5-minute window.
new cloudwatch.Alarm(this, 'ALB5xxAlarm', {
  metric: new cloudwatch.Metric({
    namespace: 'AWS/ApplicationELB',
    metricName: 'HTTPCode_Target_5XX_Count',
    dimensionsMap: {
      LoadBalancer: loadBalancer.loadBalancerFullName,
    },
    statistic: 'Sum',
    period: Duration.minutes(5),
  }),
  threshold: 10,
  evaluationPeriods: 1,
  // This metric is only reported when it is non-zero, so a window with no
  // datapoints means there were no 5XX responses. Treat it as healthy so the
  // alarm returns to OK instead of sitting in INSUFFICIENT_DATA.
  treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
  alarmDescription: 'ALB target 5XX errors exceeded threshold',
});

Response Time Alarm

// p99 target response time for the load balancer per 5-minute window.
const targetP99ResponseTime = new cloudwatch.Metric({
  namespace: 'AWS/ApplicationELB',
  metricName: 'TargetResponseTime',
  dimensionsMap: { LoadBalancer: loadBalancer.loadBalancerFullName },
  period: Duration.minutes(5),
  statistic: 'p99',
});

new cloudwatch.Alarm(this, 'ALBLatencyAlarm', {
  alarmDescription: 'ALB p99 response time exceeded threshold',
  metric: targetP99ResponseTime,
  threshold: 1, // seconds
  evaluationPeriods: 2,
});

Composite Alarms

Service Health Composite Alarm

// Member alarms; their individual configuration is elided here.
const errorAlarm = new cloudwatch.Alarm(this, 'ErrorAlarm', { /* ... */ });
const latencyAlarm = new cloudwatch.Alarm(this, 'LatencyAlarm', { /* ... */ });
const throttleAlarm = new cloudwatch.Alarm(this, 'ThrottleAlarm', { /* ... */ });

// OR-combination of the three member alarms.
const serviceHealthRule = cloudwatch.AlarmRule.anyOf(errorAlarm, latencyAlarm, throttleAlarm);

// One roll-up alarm for the whole service.
new cloudwatch.CompositeAlarm(this, 'ServiceHealthAlarm', {
  alarmDescription: 'Overall service health degraded',
  alarmRule: serviceHealthRule,
  compositeAlarmName: 'service-health',
});

Alarm Actions

SNS Topic Integration

// SNS topic that receives the alarm's state-change notifications.
const topic = new sns.Topic(this, 'AlarmTopic', { displayName: 'CloudWatch Alarms' });

// Deliver notifications to the ops mailbox.
const opsEmail = new subscriptions.EmailSubscription('ops@example.com');
topic.addSubscription(opsEmail);

// Publish to the topic both when the alarm fires and when it recovers.
const onAlarm = new actions.SnsAction(topic);
alarm.addAlarmAction(onAlarm);
const onOk = new actions.SnsAction(topic);
alarm.addOkAction(onOk);

Auto Scaling Action

// Step scaling is configured on the resource being scaled (here an
// AutoScalingGroup); a target group has no scaleOnMetric() and only supplies
// the metric that drives the policy.
// Steps: below 1s response time -> no change, 1-2s -> add 1, above 2s -> add 2.
const scalingAction = autoScalingGroup.scaleOnMetric('ScaleUp', {
  metric: targetGroup.metricTargetResponseTime(),
  scalingSteps: [
    { upper: 1, change: 0 },
    { lower: 1, change: +1 },
    { lower: 2, change: +2 },
  ],
});

Alarm Best Practices

Threshold Selection

CPU/Memory Alarms:

Warning: 70-80%
Critical: 80-90%
Consider burst patterns and normal usage

Error Rate Alarms:

Base the threshold on your SLA (e.g., a 99.9% success target allows a 0.1% error rate)
Account for normal error rates
Different thresholds for different error types

Latency Alarms:

p99 latency for user-facing APIs
Warning: 80% of SLA target
Critical: 100% of SLA target

Evaluation Periods

Fast-changing metrics (1-2 periods):

Error counts
Failed health checks
Critical application errors

Slow-changing metrics (3-5 periods):

CPU utilization
Memory usage
Disk usage

Cost-related metrics (longer periods):

Daily spending
Resource count changes
Usage patterns

Missing Data Handling

// Missing-data behavior is a construct prop chosen when the alarm is created;
// Alarm has no treatMissingData() method to call afterwards.

// For intermittent workloads
new cloudwatch.Alarm(this, 'IntermittentWorkloadAlarm', {
  // ...metric, threshold, evaluationPeriods
  treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

// For always-on services
new cloudwatch.Alarm(this, 'AlwaysOnServiceAlarm', {
  // ...metric, threshold, evaluationPeriods
  treatMissingData: cloudwatch.TreatMissingData.BREACHING,
});

// To distinguish from data issues
new cloudwatch.Alarm(this, 'DataGapAwareAlarm', {
  // ...metric, threshold, evaluationPeriods
  treatMissingData: cloudwatch.TreatMissingData.MISSING,
});

Alarm Naming Conventions

// Pattern: <service>-<metric>-<severity>
'lambda-errors-critical'
'api-latency-warning'
'rds-cpu-warning'
'ecs-tasks-critical'

Alarm Actions Best Practices

Separate topics by severity:
- Critical alarms → PagerDuty/on-call
- Warning alarms → Slack/email
- Info alarms → Metrics dashboard
Include context in alarm description:
- Service name
- Expected threshold
- Troubleshooting runbook link
Auto-remediation where possible:
- Lambda errors → automatic retry
- CPU high → auto-scaling trigger
- Disk full → automated cleanup
Alarm fatigue prevention:
- Tune thresholds based on actual patterns
- Use composite alarms to reduce noise
- Implement proper evaluation periods
- Regularly review and adjust alarms

Monitoring Dashboard

Recommended Dashboard Layout

Service Overview:

Request count and rate
Error count and percentage
Latency (p50, p95, p99)
Availability percentage

Resource Utilization:

CPU utilization by service
Memory utilization by service
Network throughput
Disk I/O

Cost Metrics:

Daily spending by service
Month-to-date costs
Budget utilization
Cost anomalies

Security Metrics:

Failed login attempts
IAM policy changes
Security group modifications
GuardDuty findings

13 KiB Raw Blame History

CloudWatch Alarms Reference

Lambda Functions

Error Rate Alarm

Duration Alarm (Approaching Timeout)

Throttle Alarm

Concurrent Executions Alarm

API Gateway

5XX Error Rate Alarm

4XX Error Rate Alarm

Latency Alarm

DynamoDB

Read Throttle Alarm

Write Throttle Alarm

Consumed Capacity Alarm

EC2 Instances

CPU Utilization Alarm

Status Check Failed Alarm

Disk Space Alarm (Requires CloudWatch Agent)

RDS Databases

CPU Alarm

Connection Count Alarm

Free Storage Space Alarm

ECS Services

Task Count Alarm

CPU Utilization Alarm

Memory Utilization Alarm

SQS Queues

Queue Depth Alarm

Age of Oldest Message Alarm

Application Load Balancer

Target Health Alarm

HTTP 5XX Alarm

Response Time Alarm

Composite Alarms

Service Health Composite Alarm

Alarm Actions

SNS Topic Integration

Auto Scaling Action

Alarm Best Practices

Threshold Selection

Evaluation Periods

Missing Data Handling

Alarm Naming Conventions

Alarm Actions Best Practices

Monitoring Dashboard

Recommended Dashboard Layout

13 KiB

Raw Blame History