Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:19:24 +08:00
commit 74075be734
22 changed files with 2851 additions and 0 deletions

View File

@@ -0,0 +1,459 @@
---
name: fairdb-health-check
description: Comprehensive health check for FairDB PostgreSQL infrastructure
model: sonnet
---
# FairDB System Health Check
Perform a comprehensive health check of the FairDB PostgreSQL infrastructure including server resources, database status, backup integrity, and customer databases.
## System Health Overview
```bash
#!/bin/bash
# FairDB Comprehensive Health Check
echo "================================================"
echo " FairDB System Health Check"
echo " $(date '+%Y-%m-%d %H:%M:%S')"
echo "================================================"
```
## Step 1: Server Resources Check
```bash
echo -e "\n[1/10] SERVER RESOURCES"
echo "------------------------"
# CPU Usage
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
echo "CPU Usage: ${CPU_USAGE}%"
if (( $(echo "$CPU_USAGE > 80" | bc -l) )); then
echo "⚠️ WARNING: High CPU usage detected"
fi
# Memory Usage
MEM_INFO=$(free -m | awk 'NR==2{printf "Memory: %s/%sMB (%.2f%%)\n", $3,$2,$3*100/$2 }')
echo "$MEM_INFO"
MEM_PERCENT=$(free | grep Mem | awk '{print $3/$2 * 100.0}')
if (( $(echo "$MEM_PERCENT > 90" | bc -l) )); then
echo "⚠️ WARNING: High memory usage detected"
fi
# Disk Usage
echo "Disk Usage:"
df -h | grep -E '^/dev/' | while read line; do
USAGE=$(echo $line | awk '{print $5}' | sed 's/%//')
MOUNT=$(echo $line | awk '{print $6}')
echo " $MOUNT: $line"
if [ $USAGE -gt 85 ]; then
echo " ⚠️ WARNING: Disk space critical on $MOUNT"
fi
done
# Load Average
LOAD=$(uptime | awk -F'load average:' '{print $2}')
echo "Load Average:$LOAD"
CORES=$(nproc)
LOAD_1=$(echo $LOAD | cut -d, -f1 | tr -d ' ')
if (( $(echo "$LOAD_1 > $CORES" | bc -l) )); then
echo "⚠️ WARNING: High load average detected"
fi
```
## Step 2: PostgreSQL Service Status
```bash
echo -e "\n[2/10] POSTGRESQL SERVICE"
echo "-------------------------"
# Check if PostgreSQL is running
if systemctl is-active --quiet postgresql; then
echo "✅ PostgreSQL service: RUNNING"
# Get version and uptime
sudo -u postgres psql -t -c "SELECT version();" | head -1
UPTIME=$(sudo -u postgres psql -t -c "
SELECT now() - pg_postmaster_start_time() as uptime;")
echo "Uptime: $UPTIME"
else
echo "❌ CRITICAL: PostgreSQL service is NOT running!"
echo "Attempting to start..."
sudo systemctl start postgresql
sleep 5
if systemctl is-active --quiet postgresql; then
echo "✅ Service restarted successfully"
else
echo "❌ Failed to start PostgreSQL - manual intervention required!"
exit 1
fi
fi
# Check PostgreSQL cluster status
sudo pg_lsclusters
```
## Step 3: Database Connections
```bash
echo -e "\n[3/10] DATABASE CONNECTIONS"
echo "---------------------------"
# Connection statistics
sudo -u postgres psql -t << EOF
SELECT
'Total Connections: ' || count(*) || '/' || setting AS connection_info
FROM pg_stat_activity, pg_settings
WHERE pg_settings.name = 'max_connections'
GROUP BY setting;
EOF
# Connections by database
echo -e "\nConnections by database:"
sudo -u postgres psql -t -c "
SELECT datname, count(*) as connections
FROM pg_stat_activity
GROUP BY datname
ORDER BY connections DESC;"
# Connections by user
echo -e "\nConnections by user:"
sudo -u postgres psql -t -c "
SELECT usename, count(*) as connections
FROM pg_stat_activity
GROUP BY usename
ORDER BY connections DESC;"
# Check for idle connections
IDLE_COUNT=$(sudo -u postgres psql -t -c "
SELECT count(*)
FROM pg_stat_activity
WHERE state = 'idle'
AND state_change < NOW() - INTERVAL '10 minutes';")
if [ $IDLE_COUNT -gt 10 ]; then
echo "⚠️ WARNING: $IDLE_COUNT idle connections older than 10 minutes"
fi
```
## Step 4: Database Performance Metrics
```bash
echo -e "\n[4/10] PERFORMANCE METRICS"
echo "--------------------------"
# Cache hit ratio
sudo -u postgres psql -t << 'EOF'
SELECT
'Cache Hit Ratio: ' ||
ROUND(100.0 * sum(heap_blks_hit) /
NULLIF(sum(heap_blks_hit) + sum(heap_blks_read), 0), 2) || '%'
FROM pg_statio_user_tables;
EOF
# Transaction statistics
sudo -u postgres psql -t -c "
SELECT
'Transactions: ' || xact_commit || ' commits, ' ||
xact_rollback || ' rollbacks, ' ||
ROUND(100.0 * xact_rollback / NULLIF(xact_commit + xact_rollback, 0), 2) || '% rollback rate'
FROM pg_stat_database
WHERE datname = 'postgres';"
# Longest running queries
echo -e "\nLong-running queries (>1 minute):"
sudo -u postgres psql -t -c "
SELECT pid, now() - query_start as duration,
LEFT(query, 50) as query_preview
FROM pg_stat_activity
WHERE state = 'active'
AND now() - query_start > interval '1 minute'
ORDER BY duration DESC
LIMIT 5;"
# Table bloat check
echo -e "\nTable bloat (top 5):"
sudo -u postgres psql -t << 'EOF'
SELECT
schemaname || '.' || tablename AS table,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size,
ROUND(100 * pg_total_relation_size(schemaname||'.'||tablename) /
NULLIF(sum(pg_total_relation_size(schemaname||'.'||tablename))
OVER (), 0), 2) AS percentage
FROM pg_tables
WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
LIMIT 5;
EOF
```
## Step 5: Backup Status
```bash
echo -e "\n[5/10] BACKUP STATUS"
echo "--------------------"
# Check pgBackRest status
if command -v pgbackrest &> /dev/null; then
echo "pgBackRest Status:"
# Get all stanzas
STANZAS=$(sudo -u postgres pgbackrest info --output=json 2>/dev/null | jq -r '.[].name' 2>/dev/null)
if [ -z "$STANZAS" ]; then
echo "⚠️ WARNING: No backup stanzas configured"
else
for STANZA in $STANZAS; do
echo -e "\nStanza: $STANZA"
# Get last backup info
LAST_BACKUP=$(sudo -u postgres pgbackrest --stanza=$STANZA info --output=json 2>/dev/null | \
jq -r '.[] | select(.name=="'$STANZA'") | .backup[-1].timestamp.stop' 2>/dev/null)
if [ ! -z "$LAST_BACKUP" ]; then
echo " Last backup: $LAST_BACKUP"
# Calculate age in hours
BACKUP_AGE=$(( ($(date +%s) - $(date -d "$LAST_BACKUP" +%s)) / 3600 ))
if [ $BACKUP_AGE -gt 25 ]; then
echo " ⚠️ WARNING: Last backup is $BACKUP_AGE hours old"
else
echo " ✅ Backup is current ($BACKUP_AGE hours old)"
fi
else
echo " ❌ ERROR: No backups found for this stanza"
fi
done
fi
else
echo "❌ ERROR: pgBackRest is not installed"
fi
# Check WAL archiving
WAL_STATUS=$(sudo -u postgres psql -t -c "SHOW archive_mode;")
echo -e "\nWAL Archiving: $WAL_STATUS"
if [ "$WAL_STATUS" = " on" ]; then
LAST_ARCHIVED=$(sudo -u postgres psql -t -c "
SELECT age(now(), last_archived_time)
FROM pg_stat_archiver;")
echo "Last WAL archived: $LAST_ARCHIVED ago"
fi
```
## Step 6: Replication Status
```bash
echo -e "\n[6/10] REPLICATION STATUS"
echo "-------------------------"
# Check if this is a primary or replica
IS_PRIMARY=$(sudo -u postgres psql -t -c "SELECT pg_is_in_recovery();")
if [ "$IS_PRIMARY" = " f" ]; then
echo "Role: PRIMARY"
# Check replication slots
REP_SLOTS=$(sudo -u postgres psql -t -c "
SELECT count(*) FROM pg_replication_slots WHERE active = true;")
echo "Active replication slots: $REP_SLOTS"
# Check connected replicas
sudo -u postgres psql -t -c "
SELECT client_addr, state, sync_state,
pg_size_pretty(pg_wal_lsn_diff(sent_lsn, replay_lsn)) as lag
FROM pg_stat_replication;" 2>/dev/null
else
echo "Role: REPLICA"
# Check replication lag
LAG=$(sudo -u postgres psql -t -c "
SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag;")
echo "Replication lag: ${LAG} seconds"
if (( $(echo "$LAG > 60" | bc -l) )); then
echo "⚠️ WARNING: High replication lag detected"
fi
fi
```
## Step 7: Security Audit
```bash
echo -e "\n[7/10] SECURITY AUDIT"
echo "---------------------"
# Check for default passwords
echo "Checking for common issues..."
# SSL status
SSL_STATUS=$(sudo -u postgres psql -t -c "SHOW ssl;")
echo "SSL: $SSL_STATUS"
if [ "$SSL_STATUS" != " on" ]; then
echo "⚠️ WARNING: SSL is not enabled"
fi
# Check for users without passwords
NO_PASS=$(sudo -u postgres psql -t -c "
SELECT count(*) FROM pg_shadow WHERE passwd IS NULL;")
if [ $NO_PASS -gt 0 ]; then
echo "⚠️ WARNING: $NO_PASS users without passwords"
fi
# Check firewall status
if sudo ufw status | grep -q "Status: active"; then
echo "✅ Firewall: ACTIVE"
else
echo "⚠️ WARNING: Firewall is not active"
fi
# Check fail2ban status
if systemctl is-active --quiet fail2ban; then
echo "✅ Fail2ban: RUNNING"
JAIL_STATUS=$(sudo fail2ban-client status postgresql 2>/dev/null | grep "Currently banned" || echo "Jail not configured")
echo " PostgreSQL jail: $JAIL_STATUS"
else
echo "⚠️ WARNING: Fail2ban is not running"
fi
```
## Step 8: Customer Database Health
```bash
echo -e "\n[8/10] CUSTOMER DATABASES"
echo "-------------------------"
# Check each customer database
CUSTOMER_DBS=$(sudo -u postgres psql -t -c "
SELECT datname FROM pg_database
WHERE datname NOT IN ('postgres', 'template0', 'template1')
ORDER BY datname;")
for DB in $CUSTOMER_DBS; do
echo -e "\nDatabase: $DB"
# Size
SIZE=$(sudo -u postgres psql -t -c "
SELECT pg_size_pretty(pg_database_size('$DB'));")
echo " Size: $SIZE"
# Connection count
CONN=$(sudo -u postgres psql -t -c "
SELECT count(*) FROM pg_stat_activity WHERE datname = '$DB';")
echo " Connections: $CONN"
# Transaction rate
TPS=$(sudo -u postgres psql -t -c "
SELECT xact_commit + xact_rollback as transactions
FROM pg_stat_database WHERE datname = '$DB';")
echo " Total transactions: $TPS"
# Check for locks
LOCKS=$(sudo -u postgres psql -t -d $DB -c "
SELECT count(*) FROM pg_locks WHERE granted = false;")
if [ $LOCKS -gt 0 ]; then
echo " ⚠️ WARNING: $LOCKS blocked locks detected"
fi
done
```
## Step 9: System Logs Analysis
```bash
echo -e "\n[9/10] LOG ANALYSIS"
echo "-------------------"
# Check PostgreSQL logs for errors
LOG_DIR="/var/log/postgresql"
if [ -d "$LOG_DIR" ]; then
echo "Recent PostgreSQL errors (last 24 hours):"
find $LOG_DIR -name "*.log" -mtime -1 -exec grep -i "error\|fatal\|panic" {} \; | \
tail -10 | head -5
ERROR_COUNT=$(find $LOG_DIR -name "*.log" -mtime -1 -exec grep -i "error\|fatal\|panic" {} \; | wc -l)
echo "Total errors in last 24 hours: $ERROR_COUNT"
if [ $ERROR_COUNT -gt 100 ]; then
echo "⚠️ WARNING: High error rate detected"
fi
fi
# Check system logs
echo -e "\nRecent system issues:"
sudo journalctl -p err -since "24 hours ago" --no-pager | tail -5
```
## Step 10: Recommendations
```bash
echo -e "\n[10/10] HEALTH SUMMARY & RECOMMENDATIONS"
echo "========================================="
# Collect all warnings
WARNINGS=0
CRITICAL=0
# Generate recommendations based on findings
echo -e "\nRecommendations:"
# Check if vacuum is needed
LAST_VACUUM=$(sudo -u postgres psql -t -c "
SELECT MAX(last_autovacuum) FROM pg_stat_user_tables;")
echo "- Last autovacuum: $LAST_VACUUM"
# Check if analyze is needed
LAST_ANALYZE=$(sudo -u postgres psql -t -c "
SELECT MAX(last_autoanalyze) FROM pg_stat_user_tables;")
echo "- Last autoanalyze: $LAST_ANALYZE"
# Generate overall health score
echo -e "\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
if [ $CRITICAL -eq 0 ] && [ $WARNINGS -lt 3 ]; then
echo "✅ OVERALL HEALTH: GOOD"
elif [ $CRITICAL -eq 0 ] && [ $WARNINGS -lt 10 ]; then
echo "⚠️ OVERALL HEALTH: FAIR - Review warnings"
else
echo "❌ OVERALL HEALTH: POOR - Immediate action required"
fi
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Save report
REPORT_FILE="/opt/fairdb/logs/health-check-$(date +%Y%m%d-%H%M%S).log"
echo -e "\nFull report saved to: $REPORT_FILE"
```
## Actions Based on Results
### If Critical Issues Found:
1. Check PostgreSQL service status
2. Review disk space availability
3. Verify backup integrity
4. Check for data corruption
5. Review security vulnerabilities
### If Warnings Found:
1. Schedule maintenance window
2. Plan capacity upgrades
3. Review query performance
4. Update monitoring thresholds
5. Document issues for trending
### Regular Maintenance Tasks:
1. Run VACUUM ANALYZE on large tables
2. Update table statistics
3. Review and optimize slow queries
4. Clean up old logs
5. Test backup restoration
## Schedule Next Health Check
```bash
# Schedule regular health checks
echo "30 */6 * * * root /usr/local/bin/fairdb-health-check > /dev/null 2>&1" | \
sudo tee /etc/cron.d/fairdb-health-check
echo "Health checks scheduled every 6 hours"
```