Files
gh-anton-abyzov-specweave-p…/agents/sre/scripts/metrics-collector.sh
2025-11-29 17:56:41 +08:00

295 lines
7.9 KiB
Bash
Executable File

#!/bin/bash
# metrics-collector.sh
#
# Gathers a snapshot of system metrics to support incident diagnosis.
#
# Usage: ./metrics-collector.sh [output_file]
#   output_file  Optional report path. When omitted or empty, a timestamped
#                name of the form metrics-YYYYMMDD-HHMMSS.txt is used.

# Stop at the first unhandled command failure.
set -e

# Resolve the report destination: the first argument if given, otherwise a
# timestamped default file name.
if [ -n "$1" ]; then
  OUTPUT_FILE=$1
else
  OUTPUT_FILE="metrics-$(date +%Y%m%d-%H%M%S).txt"
fi

# Tell the operator what is happening and where the report will be written.
printf '%s\n' "Collecting system metrics..."
printf 'Output: %s\n' "$OUTPUT_FILE"
printf '\n'
# Collect every metric section into the report file.
#
# The collection runs in a subshell with errexit switched off: these are
# best-effort diagnostics, so a missing tool (lscpu, netstat, ...) or a grep
# that matches nothing must not abort the report halfway through. The subshell
# keeps `set +e` and the banner helper from leaking into the rest of the
# script. stdout and stderr both go to the report so that tool errors are
# recorded next to the section they belong to.
(
  set +e

  # Print the three-line banner that introduces a report section.
  banner() {
    echo "========================================="
    echo "$1"
    echo "========================================="
  }

  banner "SYSTEM METRICS COLLECTION"
  echo "Date: $(date)"
  echo "Hostname: $(hostname)"
  # `uptime -p` is not available everywhere; fall back to plain uptime.
  echo "Uptime: $(uptime -p 2>/dev/null || uptime)"
  echo ""

  # 1. CPU Metrics
  banner "1. CPU METRICS"
  echo ""
  echo "CPU Info:"
  lscpu | grep -E "^Model name|^CPU\(s\)|^Thread|^Core|^Socket"
  echo ""
  echo "CPU Usage (snapshot):"
  top -bn1 | head -20
  echo ""
  echo "Load Average:"
  uptime
  echo ""
  if command -v mpstat &> /dev/null; then
    echo "CPU by Core:"
    mpstat -P ALL 1 1
    echo ""
  fi

  # 2. Memory Metrics
  banner "2. MEMORY METRICS"
  echo ""
  echo "Memory Overview:"
  free -h
  echo ""
  echo "Memory Details:"
  head -20 /proc/meminfo
  echo ""
  echo "Top Memory Processes:"
  ps aux | sort -nrk 4,4 | head -10
  echo ""

  # 3. Disk Metrics
  banner "3. DISK METRICS"
  echo ""
  echo "Disk Usage:"
  df -h
  echo ""
  echo "Inode Usage:"
  df -i
  echo ""
  if command -v iostat &> /dev/null; then
    echo "Disk I/O Stats:"
    iostat -x 1 5
    echo ""
  fi
  echo "Disk Space by Directory (/):"
  du -sh /* 2>/dev/null | sort -hr | head -20
  echo ""

  # 4. Network Metrics
  banner "4. NETWORK METRICS"
  echo ""
  echo "Network Interfaces:"
  ip addr show
  echo ""
  echo "Network Statistics:"
  netstat -s | head -50
  echo ""
  echo "Active Connections:"
  netstat -an | grep -c ESTABLISHED
  echo ""
  echo "Top 10 IPs by Connection Count:"
  # Skip netstat's two header lines and strip only the trailing ":port" so
  # IPv6 peers (which contain colons themselves) are not reduced to "".
  netstat -ntu \
    | awk 'NR > 2 { sub(/:[^:]*$/, "", $5); print $5 }' \
    | sort | uniq -c | sort -nr | head -10
  echo ""
  if command -v ss &> /dev/null; then
    echo "Socket Stats:"
    ss -s
    echo ""
  fi

  # 5. Process Metrics
  banner "5. PROCESS METRICS"
  echo ""
  echo "Process Count:"
  # `-o pid=` suppresses the header so it is not counted as a process.
  ps -e -o pid= | wc -l
  echo ""
  echo "Top CPU Processes:"
  ps aux | sort -nrk 3,3 | head -10
  echo ""
  echo "Top Memory Processes:"
  ps aux | sort -nrk 4,4 | head -10
  echo ""
  echo "Zombie Processes:"
  # Test the STAT column (8th field of `ps aux`) rather than grepping for a
  # bare "Z", which also matches the VSZ header and unrelated command lines.
  ps aux | awk 'NR == 1 || $8 ~ /^Z/'
  echo ""

  # 6. Database Metrics (PostgreSQL)
  banner "6. DATABASE METRICS (PostgreSQL)"
  echo ""
  if command -v psql &> /dev/null; then
    if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
      echo "PostgreSQL Connection Count:"
      sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;"
      echo ""
      echo "PostgreSQL Max Connections:"
      sudo -u postgres psql -t -c "SHOW max_connections;"
      echo ""
      echo "PostgreSQL Active Queries:"
      sudo -u postgres psql -x -c "SELECT pid, usename, application_name, state, query FROM pg_stat_activity WHERE state != 'idle' LIMIT 10;"
      echo ""
      echo "PostgreSQL Database Sizes:"
      sudo -u postgres psql -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;"
      echo ""
      echo "PostgreSQL Table Sizes (top 10):"
      sudo -u postgres psql -c "SELECT schemaname, tablename, pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size FROM pg_tables ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC LIMIT 10;"
      echo ""
      # pg_stat_statements is a PostgreSQL extension, not an executable, so
      # `command -v` can never find it; ask the catalog whether it is installed.
      if sudo -u postgres psql -tAc "SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements';" 2>/dev/null | grep -q 1; then
        echo "PostgreSQL Slow Queries (top 5):"
        # NOTE(review): total_exec_time/mean_exec_time are the PostgreSQL 13+
        # column names; older servers expose total_time/mean_time instead.
        sudo -u postgres psql -c "SELECT query, calls, total_exec_time, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 5;"
        echo ""
      fi
    else
      echo "PostgreSQL not accessible"
      echo ""
    fi
  else
    echo "PostgreSQL not installed"
    echo ""
  fi

  # 7. Web Server Metrics (nginx)
  banner "7. WEB SERVER METRICS (nginx)"
  echo ""
  if systemctl is-active --quiet nginx 2>/dev/null; then
    echo "Nginx Status: Running"
    if [ -f /var/log/nginx/access.log ]; then
      echo ""
      echo "Nginx Request Count (last 1000 lines):"
      tail -1000 /var/log/nginx/access.log | wc -l
      echo ""
      echo "Nginx Status Codes (last 1000 lines):"
      tail -1000 /var/log/nginx/access.log | awk '{print $9}' | sort | uniq -c | sort -nr
      echo ""
      echo "Nginx Top 10 URLs:"
      tail -1000 /var/log/nginx/access.log | awk '{print $7}' | sort | uniq -c | sort -nr | head -10
      echo ""
      echo "Nginx Top 10 IPs:"
      tail -1000 /var/log/nginx/access.log | awk '{print $1}' | sort | uniq -c | sort -nr | head -10
    fi
  else
    echo "Nginx not running"
  fi
  echo ""

  # 8. Application Metrics (customize as needed)
  banner "8. APPLICATION METRICS"
  echo ""
  echo "Application Processes:"
  # No match is a normal outcome here; errexit is off so it just prints nothing.
  ps aux | grep -E "node|java|python|ruby" | grep -v grep
  echo ""
  echo "Application Ports:"
  netstat -tlnp 2>/dev/null | grep -E "node|java|python|ruby"
  echo ""

  # 9. System Logs (recent errors)
  banner "9. RECENT SYSTEM ERRORS"
  echo ""
  echo "Recent Syslog Errors (last 50):"
  if [ -f /var/log/syslog ]; then
    grep -iE "error|fail|critical" /var/log/syslog | tail -50
  else
    echo "Syslog not found"
  fi
  echo ""
  echo "Recent Journal Errors (last 10 minutes):"
  if command -v journalctl &> /dev/null; then
    journalctl --since "10 minutes ago" --priority=err --no-pager | tail -50
  else
    echo "journalctl not available"
  fi
  echo ""

  # 10. System Info
  banner "10. SYSTEM INFORMATION"
  echo ""
  echo "OS Version:"
  cat /etc/os-release 2>/dev/null || uname -a
  echo ""
  echo "Kernel Version:"
  uname -r
  echo ""
  echo "System Time:"
  date
  echo ""
  echo "Timezone:"
  timedatectl 2>/dev/null || cat /etc/timezone
  echo ""

  # Summary
  banner "COLLECTION COMPLETE"
  echo "Collected at: $(date)"
  echo "Metrics saved to: $OUTPUT_FILE"
  echo ""
) > "$OUTPUT_FILE" 2>&1
# Print summary to console.
# Each key heading is shown together with the two report lines that follow it
# (-A 2); matching the heading alone would print labels without any values.
echo ""
echo "✅ Metrics collection complete!"
echo ""
echo "Summary:"
grep -A 2 -E "CPU Usage|Memory Overview|Disk Usage|Active Connections|PostgreSQL Connection Count" "$OUTPUT_FILE" | head -20
echo ""
echo "Full report: $OUTPUT_FILE"
echo ""
echo "Next steps:"
echo " - Review metrics for anomalies"
echo " - Compare with baseline metrics"
echo " - Share with team for analysis"
echo ""