#!/bin/bash
# health-check.sh
# Quick system health check across all layers
# Usage: ./health-check.sh
#
# Prints a human-readable report covering CPU, memory, disk, network,
# PostgreSQL, systemd services, a local HTTP health endpoint, load average
# and recent log errors. Every check is best-effort: a missing tool or an
# unhealthy metric is reported and the script moves on to the next section.
#
# `set -e` is kept so that genuinely unexpected failures stop the script, but
# `pipefail` is deliberately NOT enabled: several pipelines below end in
# `head`/`grep -q`, which close the pipe early on purpose.

set -e

echo "========================================="
echo "SYSTEM HEALTH CHECK"
echo "========================================="
echo "Date: $(date)"
echo ""

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Thresholds (percent)
CPU_WARNING=70
CPU_CRITICAL=90
MEM_WARNING=80
MEM_CRITICAL=90
DISK_WARNING=80
DISK_CRITICAL=90

#######################################
# Floating-point "greater or equal" comparison.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 if $1 >= $2, 1 otherwise
#######################################
float_ge() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !((a + 0) >= (b + 0)) }'
}

#######################################
# Print a colored status line for a metric, graded against two thresholds.
# Arguments:
#   $1 - metric name shown in the output
#   $2 - measured value (integer or decimal)
#   $3 - warning threshold
#   $4 - critical threshold
#   $5 - unit suffix appended to the value
# Outputs:   one status line on stdout
# Returns:   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = value missing/non-numeric
# Note:      the script runs under `set -e`, so callers that want to keep
#            going after a WARNING/CRITICAL must append `|| true`.
#######################################
print_status() {
  local metric=$1
  local value=$2
  local warning=$3
  local critical=$4
  local unit=$5
  local numeric_re='^-?[0-9]*[.]?[0-9]+$'

  # A failed parse upstream yields an empty or garbage value; report that
  # explicitly instead of letting it fall through to the OK branch.
  if ! [[ "$value" =~ $numeric_re ]]; then
    echo -e "${YELLOW}? $metric: unavailable (UNKNOWN)${NC}"
    return 3
  fi

  if float_ge "$value" "$critical"; then
    echo -e "${RED}✗ $metric: ${value}${unit} (CRITICAL)${NC}"
    return 2
  elif float_ge "$value" "$warning"; then
    echo -e "${YELLOW}⚠ $metric: ${value}${unit} (WARNING)${NC}"
    return 1
  else
    echo -e "${GREEN}✓ $metric: ${value}${unit} (OK)${NC}"
    return 0
  fi
}

# 1. CPU Check
echo "1. CPU Usage"
echo "-------------"
# LC_ALL=C keeps the decimal separator a '.' so the sed/awk parsing works in
# every locale. Usage is derived as 100 minus the idle percentage.
CPU_USAGE=$(LC_ALL=C top -bn1 \
  | grep "Cpu(s)" \
  | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" \
  | awk '{print 100 - $1}')
print_status "CPU" "$CPU_USAGE" "$CPU_WARNING" "$CPU_CRITICAL" "%" || true

# Top CPU processes
echo " Top 5 CPU processes:"
ps aux | sort -nrk 3,3 | head -5 \
  | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $3}'
echo ""

# 2. Memory Check
echo "2. Memory Usage"
echo "---------------"
MEM_USAGE=$(LC_ALL=C free | awk '/^Mem/ {print ($3/$2) * 100.0}')
print_status "Memory" "$MEM_USAGE" "$MEM_WARNING" "$MEM_CRITICAL" "%" || true

# Memory details ($1 already ends in ':' — e.g. "Mem:" — so none is added)
LC_ALL=C free -h \
  | awk '/Mem|Swap/ {printf " %s %s used / %s total\n", $1, $3, $2}'

# Top memory processes
echo " Top 5 memory processes:"
ps aux | sort -nrk 4,4 | head -5 \
  | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $4}'
echo ""

# 3. Disk Check
echo "3. Disk Usage"
echo "-------------"
# -P forces one line per filesystem (no wrapping of long device names).
# The last `read` variable soaks up the rest of the line, so mount points
# containing spaces stay intact.
df -hP | grep -vE '^Filesystem|tmpfs|cdrom|loop' \
  | while read -r _fs _size _used _avail pct mount; do
      print_status "$mount" "${pct%\%}" "$DISK_WARNING" "$DISK_CRITICAL" "%" || true
    done

# Disk I/O
echo " Disk I/O:"
if command -v iostat &> /dev/null; then
  # Two samples are taken because the first report is the average since boot;
  # only device rows from the second (1-second) sample are printed.
  LC_ALL=C iostat -dx 1 2 | awk '
    /^Device/ { report++; next }
    report == 2 && NF > 1 { printf " %s: %.1f%% utilization\n", $1, $NF }
  '
else
  echo " (iostat not installed)"
fi
echo ""

# 4. Network Check
echo "4. Network"
echo "----------"

# Check connectivity
if ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
  echo -e "${GREEN}✓ Internet connectivity: OK${NC}"
else
  echo -e "${RED}✗ Internet connectivity: FAILED${NC}"
fi

# DNS check
if nslookup google.com &> /dev/null; then
  echo -e "${GREEN}✓ DNS resolution: OK${NC}"
else
  echo -e "${RED}✗ DNS resolution: FAILED${NC}"
fi

# Connection count
# grep -c exits 1 when the count is 0; `|| true` keeps `set -e` from aborting.
CONN_COUNT=$(netstat -an 2>/dev/null | grep -c ESTABLISHED || true)
echo " Active connections: $CONN_COUNT"
echo ""

# 5. Database Check (if PostgreSQL installed)
echo "5. Database (PostgreSQL)"
echo "------------------------"
if command -v psql &> /dev/null; then
  # Try to connect
  if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
    echo -e "${GREEN}✓ PostgreSQL: Running${NC}"

    # Connection count (-A: unaligned, -t: tuples only => bare values)
    CONN=$(sudo -u postgres psql -At -c "SELECT count(*) FROM pg_stat_activity;")
    MAX_CONN=$(sudo -u postgres psql -At -c "SHOW max_connections;")
    # Multiply before dividing so small ratios are not rounded down to 0.
    CONN_PCT=$(awk -v c="$CONN" -v m="$MAX_CONN" \
      'BEGIN { if (m + 0 > 0) printf "%.1f", c * 100 / m }')
    print_status "Connections" "$CONN_PCT" "80" "90" "% ($CONN/$MAX_CONN)" || true

    # Database size
    echo " Database sizes:"
    sudo -u postgres psql -At -F '|' -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;" \
      | head -5 \
      | awk -F'|' '{printf " - %s: %s\n", $1, $2}'
  else
    echo -e "${RED}✗ PostgreSQL: Not accessible${NC}"
  fi
else
  echo " PostgreSQL not installed"
fi
echo ""

# 6. Services Check
echo "6. Services"
echo "-----------"

# List of services to check (customize as needed)
SERVICES=("nginx" "postgresql" "redis-server")

for service in "${SERVICES[@]}"; do
  if systemctl is-active --quiet "$service" 2>/dev/null; then
    echo -e "${GREEN}✓ $service: Running${NC}"
  elif systemctl list-unit-files 2>/dev/null | grep -q "^$service"; then
    echo -e "${RED}✗ $service: Stopped${NC}"
  else
    echo " $service: Not installed"
  fi
done
echo ""

# 7. API Response Time (if applicable)
echo "7. API Health"
echo "-------------"

# Check localhost health endpoint
if command -v curl &> /dev/null; then
  HEALTH_URL="http://localhost/health"

  # Time the request. curl still emits the -w text when the connection fails
  # (code 000), so its non-zero exit status is ignored on purpose.
  RESPONSE=$(curl -s --max-time 5 -o /dev/null \
    -w '%{http_code} %{time_total}' "$HEALTH_URL" 2>/dev/null) || true
  read -r HTTP_CODE TIME <<< "$RESPONSE"

  if [ "$HTTP_CODE" = "200" ]; then
    TIME_MS=$(awk -v t="$TIME" 'BEGIN { printf "%.1f", t * 1000 }')
    echo -e "${GREEN}✓ Health endpoint: Responding (${TIME_MS}ms)${NC}"
  else
    echo -e "${RED}✗ Health endpoint: Failed (HTTP $HTTP_CODE)${NC}"
  fi
else
  echo " curl not installed"
fi
echo ""

# 8. Load Average
echo "8. Load Average"
echo "---------------"
LOAD=$(LC_ALL=C uptime | awk -F'load average:' '{ print $2 }')
CORES=$(nproc)
echo " Load: $LOAD"
echo " CPU cores: $CORES"

# First comma-separated field is the 1-minute average; xargs trims whitespace.
LOAD_1MIN=$(echo "$LOAD" | awk -F', ' '{print $1}' | xargs)
LOAD_PER_CORE=$(awk -v l="$LOAD_1MIN" -v c="$CORES" \
  'BEGIN { if (c + 0 > 0) printf "%.2f", l / c }')

if float_ge "$LOAD_PER_CORE" 2.0; then
  echo -e "${RED}✗ Load per core: ${LOAD_PER_CORE} (HIGH)${NC}"
elif float_ge "$LOAD_PER_CORE" 1.0; then
  echo -e "${YELLOW}⚠ Load per core: ${LOAD_PER_CORE} (ELEVATED)${NC}"
else
  echo -e "${GREEN}✓ Load per core: ${LOAD_PER_CORE} (OK)${NC}"
fi
echo ""

# 9. Recent Errors
echo "9. Recent Errors (last 10 minutes)"
echo "-----------------------------------"
if [ -f /var/log/syslog ]; then
  # syslog has no cheap time filter, so the most recent 1000 lines are used
  # as an approximation of "recent". tail must run BEFORE the count.
  ERROR_COUNT=$(tail -n 1000 /var/log/syslog 2>/dev/null \
    | grep -c -e error -e Error -e ERROR || true)
  echo " Syslog errors: $ERROR_COUNT"
fi

# Check journal if systemd
if command -v journalctl &> /dev/null; then
  # -q suppresses informational lines so they are not counted as errors.
  JOURNAL_ERRORS=$(journalctl -q --since "10 minutes ago" --priority=err --no-pager 2>/dev/null | wc -l)
  echo " Journalctl errors: $JOURNAL_ERRORS"
fi
echo ""

# Summary
echo "========================================="
echo "SUMMARY"
echo "========================================="
echo "Health check completed at $(date)"
echo ""
echo "Next steps:"
echo "- If any CRITICAL issues, investigate immediately"
echo "- If WARNING issues, monitor and plan mitigation"
echo "- Review playbooks: ../playbooks/"
echo ""