Initial commit
This commit is contained in:
230
agents/sre/scripts/health-check.sh
Executable file
230
agents/sre/scripts/health-check.sh
Executable file
@@ -0,0 +1,230 @@
|
||||
#!/bin/bash

# health-check.sh
# Quick system health check across all layers
# Usage: ./health-check.sh

set -e

echo "========================================="
echo "SYSTEM HEALTH CHECK"
echo "========================================="
echo "Date: $(date)"
echo ""

# ANSI color codes used by the status printers below; NC resets.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Percentage thresholds consumed by print_status for each resource class.
CPU_WARNING=70
CPU_CRITICAL=90
MEM_WARNING=80
MEM_CRITICAL=90
DISK_WARNING=80
DISK_CRITICAL=90
# Helper function for status
|
||||
print_status() {
|
||||
local metric=$1
|
||||
local value=$2
|
||||
local warning=$3
|
||||
local critical=$4
|
||||
local unit=$5
|
||||
|
||||
if (( $(echo "$value >= $critical" | bc -l) )); then
|
||||
echo -e "${RED}✗ $metric: ${value}${unit} (CRITICAL)${NC}"
|
||||
return 2
|
||||
elif (( $(echo "$value >= $warning" | bc -l) )); then
|
||||
echo -e "${YELLOW}⚠ $metric: ${value}${unit} (WARNING)${NC}"
|
||||
return 1
|
||||
else
|
||||
echo -e "${GREEN}✓ $metric: ${value}${unit} (OK)${NC}"
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
# 1. CPU Check
echo "1. CPU Usage"
echo "-------------"
# Idle% parsed from GNU top output, inverted to usage%.
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
# `|| true`: print_status signals severity via its return code; without the
# guard a WARNING/CRITICAL value would abort the script under `set -e`.
print_status "CPU" "$CPU_USAGE" "$CPU_WARNING" "$CPU_CRITICAL" "%" || true

# Top CPU processes
echo " Top 5 CPU processes:"
ps aux | sort -nrk 3,3 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $3}'
echo ""

# 2. Memory Check
echo "2. Memory Usage"
echo "---------------"
MEM_USAGE=$(free | grep Mem | awk '{print ($3/$2) * 100.0}')
print_status "Memory" "$MEM_USAGE" "$MEM_WARNING" "$MEM_CRITICAL" "%" || true

# Memory details
free -h | grep -E "Mem|Swap" | awk '{printf " %s: %s used / %s total\n", $1, $3, $2}'

# Top memory processes
echo " Top 5 memory processes:"
ps aux | sort -nrk 4,4 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $4}'
echo ""

# 3. Disk Check
echo "3. Disk Usage"
echo "-------------"
# The while loop runs in a pipeline subshell; that is fine here because no
# variable needs to survive past the loop.
df -h | grep -vE '^Filesystem|tmpfs|cdrom|loop' | while read -r line; do
  DISK=$(echo "$line" | awk '{print $1}')
  MOUNT=$(echo "$line" | awk '{print $6}')
  USAGE=$(echo "$line" | awk '{print $5}' | sed 's/%//')

  print_status "$MOUNT" "$USAGE" "$DISK_WARNING" "$DISK_CRITICAL" "%" || true
done

# Disk I/O
echo " Disk I/O:"
if command -v iostat &> /dev/null; then
  iostat -x 1 2 | tail -n +4 | awk 'NR>1 {printf " %s: %.1f%% utilization\n", $1, $NF}'
else
  echo " (iostat not installed)"
fi
echo ""

# 4. Network Check
echo "4. Network"
echo "----------"

# Check connectivity
if ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
  echo -e "${GREEN}✓ Internet connectivity: OK${NC}"
else
  echo -e "${RED}✗ Internet connectivity: FAILED${NC}"
fi

# DNS check
if nslookup google.com &> /dev/null; then
  echo -e "${GREEN}✓ DNS resolution: OK${NC}"
else
  echo -e "${RED}✗ DNS resolution: FAILED${NC}"
fi

# Connection count. grep -c prints 0 on no match but exits 1, which would
# fail the assignment under `set -e`; `|| true` keeps the already-printed 0.
CONN_COUNT=$(netstat -an 2>/dev/null | grep -c ESTABLISHED || true)
echo " Active connections: $CONN_COUNT"
echo ""
|
||||
|
||||
# 5. Database Check (if PostgreSQL installed)
echo "5. Database (PostgreSQL)"
echo "------------------------"
if command -v psql &> /dev/null; then
  # Try to connect
  if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
    echo -e "${GREEN}✓ PostgreSQL: Running${NC}"

    # Connection count. `psql -t` pads its output with whitespace, so
    # strip it with xargs before feeding the numbers to bc.
    CONN=$(sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;" | xargs)
    MAX_CONN=$(sudo -u postgres psql -t -c "SHOW max_connections;" | xargs)
    CONN_PCT=$(echo "scale=1; $CONN / $MAX_CONN * 100" | bc)
    # `|| true`: a WARNING/CRITICAL return code must not kill the script
    # under `set -e`.
    print_status "Connections" "$CONN_PCT" "80" "90" "% ($CONN/$MAX_CONN)" || true

    # Database size
    echo " Database sizes:"
    sudo -u postgres psql -t -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;" | head -5 | awk '{printf " - %s: %s\n", $1, $3}'
  else
    echo -e "${RED}✗ PostgreSQL: Not accessible${NC}"
  fi
else
  echo " PostgreSQL not installed"
fi
echo ""

# 6. Services Check
echo "6. Services"
echo "-----------"

# List of services to check (customize as needed)
SERVICES=("nginx" "postgresql" "redis-server")

for service in "${SERVICES[@]}"; do
  if systemctl is-active --quiet "$service" 2>/dev/null; then
    echo -e "${GREEN}✓ $service: Running${NC}"
  else
    # Distinguish "installed but stopped" from "not installed at all".
    if systemctl list-unit-files | grep -q "^$service"; then
      echo -e "${RED}✗ $service: Stopped${NC}"
    else
      echo " $service: Not installed"
    fi
  fi
done
echo ""
|
||||
|
||||
# 7. API Response Time (if applicable)
echo "7. API Health"
echo "-------------"

# Check localhost health endpoint
if command -v curl &> /dev/null; then
  HEALTH_URL="http://localhost/health"

  # Time the request. The body goes to /dev/null, so stdout carries only
  # the -w format: "<http_code>\n<time_total>". (The original format began
  # with "\n", which made line 1 empty so the parsed HTTP code was always
  # blank.) `|| true`: a connection failure must not abort under `set -e`.
  RESPONSE=$(curl -s -w "%{http_code}\n%{time_total}" -o /dev/null "$HEALTH_URL" 2>/dev/null || true)
  HTTP_CODE=$(echo "$RESPONSE" | sed -n '1p')
  TIME=$(echo "$RESPONSE" | sed -n '2p')

  if [ "$HTTP_CODE" = "200" ]; then
    TIME_MS=$(echo "$TIME * 1000" | bc)
    echo -e "${GREEN}✓ Health endpoint: Responding (${TIME_MS}ms)${NC}"
  else
    echo -e "${RED}✗ Health endpoint: Failed (HTTP $HTTP_CODE)${NC}"
  fi
else
  echo " curl not installed"
fi
echo ""

# 8. Load Average
echo "8. Load Average"
echo "---------------"
LOAD=$(uptime | awk -F'load average:' '{ print $2 }')
CORES=$(nproc)
echo " Load: $LOAD"
echo " CPU cores: $CORES"
LOAD_1MIN=$(echo "$LOAD" | awk -F', ' '{print $1}' | xargs)
LOAD_PER_CORE=$(echo "scale=2; $LOAD_1MIN / $CORES" | bc)

if (( $(echo "$LOAD_PER_CORE >= 2.0" | bc -l) )); then
  echo -e "${RED}✗ Load per core: ${LOAD_PER_CORE} (HIGH)${NC}"
elif (( $(echo "$LOAD_PER_CORE >= 1.0" | bc -l) )); then
  echo -e "${YELLOW}⚠ Load per core: ${LOAD_PER_CORE} (ELEVATED)${NC}"
else
  echo -e "${GREEN}✓ Load per core: ${LOAD_PER_CORE} (OK)${NC}"
fi
echo ""

# 9. Recent Errors
echo "9. Recent Errors (last 10 minutes)"
echo "-----------------------------------"
if [ -f /var/log/syslog ]; then
  # Count error lines in the last 1000 syslog lines. (The original piped
  # grep -c's single-number output through `tail -1000`, a no-op that
  # actually counted the entire file.) NOTE: this is the last 1000 lines,
  # not a strict 10-minute window.
  ERROR_COUNT=$(tail -1000 /var/log/syslog | grep -ci "error" || true)
  echo " Syslog errors: $ERROR_COUNT"
fi

# Check journal if systemd
if command -v journalctl &> /dev/null; then
  JOURNAL_ERRORS=$(journalctl --since "10 minutes ago" --priority=err --no-pager | wc -l)
  echo " Journalctl errors: $JOURNAL_ERRORS"
fi
echo ""

# Summary
echo "========================================="
echo "SUMMARY"
echo "========================================="
echo "Health check completed at $(date)"
echo ""
echo "Next steps:"
echo "- If any CRITICAL issues, investigate immediately"
echo "- If WARNING issues, monitor and plan mitigation"
echo "- Review playbooks: ../playbooks/"
echo ""
||||
213
agents/sre/scripts/log-analyzer.py
Executable file
213
agents/sre/scripts/log-analyzer.py
Executable file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
log-analyzer.py
|
||||
Parse application/system logs for error patterns and anomalies
|
||||
|
||||
Usage: python3 log-analyzer.py /var/log/application.log
|
||||
python3 log-analyzer.py /var/log/application.log --errors-only
|
||||
python3 log-analyzer.py /var/log/application.log --since "2025-10-26 14:00"
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime, timedelta
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
def parse_args():
    """Build the analyzer's command-line interface and parse sys.argv."""
    cli = argparse.ArgumentParser(description='Analyze log files for errors and patterns')
    cli.add_argument('logfile', help='Path to log file')
    cli.add_argument('--errors-only', action='store_true', help='Show only errors (ERROR, FATAL)')
    cli.add_argument('--warnings', action='store_true', help='Include warnings')
    cli.add_argument('--since', help='Show logs since timestamp (YYYY-MM-DD HH:MM)')
    cli.add_argument('--until', help='Show logs until timestamp (YYYY-MM-DD HH:MM)')
    cli.add_argument('--pattern', help='Search for specific pattern (regex)')
    cli.add_argument('--top', type=int, default=10, help='Show top N errors (default: 10)')
    return cli.parse_args()
|
||||
|
||||
def parse_log_line(line):
    """Match one raw log line against the known layouts.

    Returns a dict with 'timestamp', 'level' and 'message' keys. Lines
    matching no known layout fall back to level INFO with the stripped
    line as the message and no timestamp.
    """
    known_formats = (
        # JSON: {"timestamp":"2025-10-26T14:00:00Z","level":"ERROR","message":"..."}
        r'\{"timestamp":"(?P<timestamp>[^"]+)".*"level":"(?P<level>[^"]+)".*"message":"(?P<message>[^"]+)"',
        # Standard: [2025-10-26 14:00:00] ERROR: message
        r'\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+(?P<level>\w+):\s+(?P<message>.*)',
        # Syslog: Oct 26 14:00:00 hostname application[1234]: ERROR message
        r'(?P<timestamp>\w+ \d+ \d{2}:\d{2}:\d{2})\s+\S+\s+\S+:\s+(?P<level>\w+)\s+(?P<message>.*)',
        # Simple: 2025-10-26 14:00:00 ERROR message
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?P<level>\w+)\s+(?P<message>.*)',
    )

    for fmt in known_formats:
        hit = re.match(fmt, line)
        if hit is not None:
            return hit.groupdict()

    # Nothing matched: treat the whole line as an INFO message.
    return {'timestamp': None, 'level': 'INFO', 'message': line.strip()}
|
||||
|
||||
def parse_timestamp(ts_str):
    """Parse a timestamp string from any of the supported log formats.

    Returns a datetime, or None when ts_str is falsy or matches no known
    format. Syslog-style stamps ('Oct 26 14:00:00') carry no year;
    strptime defaults them to 1900, which made every --since/--until
    comparison fail, so the current year is substituted instead.
    """
    if not ts_str:
        return None

    formats = [
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S',
        '%b %d %H:%M:%S',
    ]

    for fmt in formats:
        try:
            parsed = datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
        if parsed.year == 1900:
            # Year-less syslog format: assume the current year.
            # NOTE(review): around New Year this can mis-date December
            # entries read in January.
            parsed = parsed.replace(year=datetime.now().year)
        return parsed

    return None
|
||||
|
||||
def main():
    """Drive the analysis: parse CLI args, stream the log file once, and
    print summary, top-error, hourly-histogram, timeline and
    recommendation sections."""
    args = parse_args()

    # Parse filters
    since = datetime.strptime(args.since, '%Y-%m-%d %H:%M') if args.since else None
    until = datetime.strptime(args.until, '%Y-%m-%d %H:%M') if args.until else None

    # Stats
    total_lines = 0
    error_count = 0
    warning_count = 0
    error_messages = Counter()          # message prefix -> occurrences
    errors_by_hour = defaultdict(int)   # 'YYYY-MM-DD HH:00' -> error count
    error_timeline = []                 # (timestamp, message), dated errors only

    print(f"Analyzing log file: {args.logfile}")
    print("=" * 80)
    print()

    try:
        # errors='ignore' drops undecodable bytes rather than raising.
        with open(args.logfile, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                total_lines += 1

                # Parse log line
                parsed = parse_log_line(line)
                level = parsed.get('level', '').upper()
                message = parsed.get('message', '')
                timestamp = parse_timestamp(parsed.get('timestamp'))

                # Filter by time range.
                # NOTE(review): lines without a parseable timestamp bypass
                # the --since/--until filters entirely.
                if since and timestamp and timestamp < since:
                    continue
                if until and timestamp and timestamp > until:
                    continue

                # Filter by pattern
                if args.pattern and not re.search(args.pattern, message, re.IGNORECASE):
                    continue

                # Filter by level
                if args.errors_only and level not in ['ERROR', 'FATAL', 'CRITICAL']:
                    continue

                # Count errors and warnings
                if level in ['ERROR', 'FATAL', 'CRITICAL']:
                    error_count += 1

                    # Extract error message (first 100 chars) so near-identical
                    # messages aggregate under one key.
                    error_key = message[:100] if len(message) > 100 else message
                    error_messages[error_key] += 1

                    # Group by hour (only errors that carry a timestamp).
                    if timestamp:
                        hour_key = timestamp.strftime('%Y-%m-%d %H:00')
                        errors_by_hour[hour_key] += 1
                        error_timeline.append((timestamp, message))

                elif level in ['WARN', 'WARNING'] and args.warnings:
                    warning_count += 1

        # Print summary
        print(f"📊 SUMMARY")
        print(f"---------")
        print(f"Total lines: {total_lines:,}")
        print(f"Errors: {error_count:,}")
        if args.warnings:
            print(f"Warnings: {warning_count:,}")
        print()

        # Top errors, most frequent first.
        if error_messages:
            print(f"🔥 TOP {args.top} ERRORS")
            print(f"{'Count':<10} {'Message':<70}")
            print("-" * 80)
            for msg, count in error_messages.most_common(args.top):
                msg_short = (msg[:67] + '...') if len(msg) > 70 else msg
                print(f"{count:<10} {msg_short}")
            print()

        # Errors by hour, rendered as a bar chart scaled to the peak hour.
        if errors_by_hour:
            print(f"📈 ERRORS BY HOUR")
            print(f"{'Hour':<20} {'Count':<10} {'Graph':<50}")
            print("-" * 80)

            max_errors = max(errors_by_hour.values())
            for hour in sorted(errors_by_hour.keys()):
                count = errors_by_hour[hour]
                bar_length = int((count / max_errors) * 40)
                bar = '█' * bar_length
                print(f"{hour:<20} {count:<10} {bar}")
            print()

        # Error timeline (last 20, newest first)
        if error_timeline:
            print(f"⏱️ ERROR TIMELINE (Last 20)")
            print(f"{'Timestamp':<20} {'Message':<60}")
            print("-" * 80)

            for timestamp, message in sorted(error_timeline, reverse=True)[:20]:
                ts_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                msg_short = (message[:57] + '...') if len(message) > 60 else message
                print(f"{ts_str:<20} {msg_short}")
            print()

        # Recommendations, tiered by total error volume.
        print(f"💡 RECOMMENDATIONS")
        print(f"-----------------")

        if error_count == 0:
            print("✅ No errors found. System looks healthy!")
        elif error_count < 10:
            print(f"⚠️ {error_count} errors found. Review above for details.")
        elif error_count < 100:
            print(f"⚠️ {error_count} errors found. Investigate top errors.")
        else:
            print(f"🚨 {error_count} errors found! Immediate investigation required.")
            print(" - Check for cascading failures")
            print(" - Review error timeline for spike")
            print(" - Check related services")

        if errors_by_hour:
            # Find hour with most errors
            peak_hour = max(errors_by_hour.items(), key=lambda x: x[1])
            print(f"\n📍 Peak error hour: {peak_hour[0]} ({peak_hour[1]} errors)")
            print(f" - Review what happened at this time")
            print(f" - Check deployment, traffic spike, external dependency")

        print()

    except FileNotFoundError:
        print(f"❌ Error: Log file not found: {args.logfile}")
        sys.exit(1)
    except PermissionError:
        print(f"❌ Error: Permission denied: {args.logfile}")
        print(f" Try: sudo python3 {sys.argv[0]} {args.logfile}")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
294
agents/sre/scripts/metrics-collector.sh
Executable file
294
agents/sre/scripts/metrics-collector.sh
Executable file
@@ -0,0 +1,294 @@
|
||||
#!/bin/bash

# metrics-collector.sh
# Gather system metrics for incident diagnosis
# Usage: ./metrics-collector.sh [output_file]

set -e

# Destination report file; defaults to a timestamped name in the CWD.
OUTPUT_FILE=${1:-"metrics-$(date +%Y%m%d-%H%M%S).txt"}

echo "Collecting system metrics..."
echo "Output: $OUTPUT_FILE"
echo ""
|
||||
|
||||
{
|
||||
echo "========================================="
|
||||
echo "SYSTEM METRICS COLLECTION"
|
||||
echo "========================================="
|
||||
echo "Date: $(date)"
|
||||
echo "Hostname: $(hostname)"
|
||||
echo "Uptime: $(uptime -p 2>/dev/null || uptime)"
|
||||
echo ""
|
||||
|
||||
# 1. CPU Metrics
|
||||
echo "========================================="
|
||||
echo "1. CPU METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "CPU Info:"
|
||||
lscpu | grep -E "^Model name|^CPU\(s\)|^Thread|^Core|^Socket"
|
||||
echo ""
|
||||
|
||||
echo "CPU Usage (snapshot):"
|
||||
top -bn1 | head -20
|
||||
echo ""
|
||||
|
||||
echo "Load Average:"
|
||||
uptime
|
||||
echo ""
|
||||
|
||||
if command -v mpstat &> /dev/null; then
|
||||
echo "CPU by Core:"
|
||||
mpstat -P ALL 1 1
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 2. Memory Metrics
|
||||
echo "========================================="
|
||||
echo "2. MEMORY METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Memory Overview:"
|
||||
free -h
|
||||
echo ""
|
||||
|
||||
echo "Memory Details:"
|
||||
cat /proc/meminfo | head -20
|
||||
echo ""
|
||||
|
||||
echo "Top Memory Processes:"
|
||||
ps aux | sort -nrk 4,4 | head -10
|
||||
echo ""
|
||||
|
||||
# 3. Disk Metrics
|
||||
echo "========================================="
|
||||
echo "3. DISK METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Disk Usage:"
|
||||
df -h
|
||||
echo ""
|
||||
|
||||
echo "Inode Usage:"
|
||||
df -i
|
||||
echo ""
|
||||
|
||||
if command -v iostat &> /dev/null; then
|
||||
echo "Disk I/O Stats:"
|
||||
iostat -x 1 5
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo "Disk Space by Directory (/):"
|
||||
du -sh /* 2>/dev/null | sort -hr | head -20
|
||||
echo ""
|
||||
|
||||
# 4. Network Metrics
|
||||
echo "========================================="
|
||||
echo "4. NETWORK METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Network Interfaces:"
|
||||
ip addr show
|
||||
echo ""
|
||||
|
||||
echo "Network Statistics:"
|
||||
netstat -s | head -50
|
||||
echo ""
|
||||
|
||||
echo "Active Connections:"
|
||||
netstat -an | grep ESTABLISHED | wc -l
|
||||
echo ""
|
||||
|
||||
echo "Top 10 IPs by Connection Count:"
|
||||
netstat -ntu | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -10
|
||||
echo ""
|
||||
|
||||
if command -v ss &> /dev/null; then
|
||||
echo "Socket Stats:"
|
||||
ss -s
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 5. Process Metrics
|
||||
echo "========================================="
|
||||
echo "5. PROCESS METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Process Count:"
|
||||
ps aux | wc -l
|
||||
echo ""
|
||||
|
||||
echo "Top CPU Processes:"
|
||||
ps aux | sort -nrk 3,3 | head -10
|
||||
echo ""
|
||||
|
||||
echo "Top Memory Processes:"
|
||||
ps aux | sort -nrk 4,4 | head -10
|
||||
echo ""
|
||||
|
||||
echo "Zombie Processes:"
|
||||
ps aux | grep -E "<defunct>|Z" | grep -v grep
|
||||
echo ""
|
||||
|
||||
# 6. Database Metrics (PostgreSQL)
|
||||
echo "========================================="
|
||||
echo "6. DATABASE METRICS (PostgreSQL)"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
if command -v psql &> /dev/null; then
|
||||
if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
|
||||
echo "PostgreSQL Connection Count:"
|
||||
sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Max Connections:"
|
||||
sudo -u postgres psql -t -c "SHOW max_connections;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Active Queries:"
|
||||
sudo -u postgres psql -x -c "SELECT pid, usename, application_name, state, query FROM pg_stat_activity WHERE state != 'idle' LIMIT 10;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Database Sizes:"
|
||||
sudo -u postgres psql -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Table Sizes (top 10):"
|
||||
sudo -u postgres psql -c "SELECT schemaname, tablename, pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size FROM pg_tables ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC LIMIT 10;"
|
||||
echo ""
|
||||
|
||||
if command -v pg_stat_statements &> /dev/null; then
|
||||
echo "PostgreSQL Slow Queries (top 5):"
|
||||
sudo -u postgres psql -c "SELECT query, calls, total_exec_time, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 5;"
|
||||
echo ""
|
||||
fi
|
||||
else
|
||||
echo "PostgreSQL not accessible"
|
||||
echo ""
|
||||
fi
|
||||
else
|
||||
echo "PostgreSQL not installed"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 7. Web Server Metrics (nginx)
|
||||
echo "========================================="
|
||||
echo "7. WEB SERVER METRICS (nginx)"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
if systemctl is-active --quiet nginx 2>/dev/null; then
|
||||
echo "Nginx Status: Running"
|
||||
|
||||
if [ -f /var/log/nginx/access.log ]; then
|
||||
echo ""
|
||||
echo "Nginx Request Count (last 1000 lines):"
|
||||
tail -1000 /var/log/nginx/access.log | wc -l
|
||||
|
||||
echo ""
|
||||
echo "Nginx Status Codes (last 1000 lines):"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $9}' | sort | uniq -c | sort -nr
|
||||
|
||||
echo ""
|
||||
echo "Nginx Top 10 URLs:"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $7}' | sort | uniq -c | sort -nr | head -10
|
||||
|
||||
echo ""
|
||||
echo "Nginx Top 10 IPs:"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $1}' | sort | uniq -c | sort -nr | head -10
|
||||
fi
|
||||
else
|
||||
echo "Nginx not running"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# 8. Application Metrics (customize as needed)
|
||||
echo "========================================="
|
||||
echo "8. APPLICATION METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Application Processes:"
|
||||
ps aux | grep -E "node|java|python|ruby" | grep -v grep
|
||||
echo ""
|
||||
|
||||
echo "Application Ports:"
|
||||
netstat -tlnp 2>/dev/null | grep -E "node|java|python|ruby"
|
||||
echo ""
|
||||
|
||||
  # 9. System Logs (recent errors)
  echo "========================================="
  echo "9. RECENT SYSTEM ERRORS"
  echo "========================================="
  echo ""

  echo "Recent Syslog Errors (last 50):"
  if [ -f /var/log/syslog ]; then
    # The pipeline's last stage is tail, so a non-matching grep cannot
    # trip `set -e` here.
    grep -i "error\|fail\|critical" /var/log/syslog | tail -50
  else
    echo "Syslog not found"
  fi
  echo ""

  echo "Recent Journal Errors (last 10 minutes):"
  if command -v journalctl &> /dev/null; then
    journalctl --since "10 minutes ago" --priority=err --no-pager | tail -50
  else
    echo "journalctl not available"
  fi
  echo ""

  # 10. System Info
  echo "========================================="
  echo "10. SYSTEM INFORMATION"
  echo "========================================="
  echo ""

  echo "OS Version:"
  cat /etc/os-release 2>/dev/null || uname -a
  echo ""

  echo "Kernel Version:"
  uname -r
  echo ""

  echo "System Time:"
  date
  echo ""

  echo "Timezone:"
  timedatectl 2>/dev/null || cat /etc/timezone
  echo ""

  # Summary
  echo "========================================="
  echo "COLLECTION COMPLETE"
  echo "========================================="
  echo "Collected at: $(date)"
  echo "Metrics saved to: $OUTPUT_FILE"
  echo ""

# The entire group above — stdout and stderr — is captured into the report.
} > "$OUTPUT_FILE" 2>&1
|
||||
|
||||
# Print a short confirmation plus report pointers on the console
# (the detailed output lives in $OUTPUT_FILE).
printf '%s\n' \
  "" \
  "✅ Metrics collection complete!" \
  "" \
  "Summary:"
grep -E "CPU Usage|Memory Overview|Disk Usage|Active Connections|PostgreSQL Connection Count" "$OUTPUT_FILE" | head -20
printf '%s\n' \
  "" \
  "Full report: $OUTPUT_FILE" \
  "" \
  "Next steps:" \
  " - Review metrics for anomalies" \
  " - Compare with baseline metrics" \
  " - Share with team for analysis" \
  ""
|
||||
257
agents/sre/scripts/trace-analyzer.js
Executable file
257
agents/sre/scripts/trace-analyzer.js
Executable file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env node

/**
 * trace-analyzer.js
 * Analyze distributed tracing data to identify bottlenecks
 *
 * Usage: node trace-analyzer.js <trace-id>
 *        node trace-analyzer.js <trace-id> --format=json
 *        node trace-analyzer.js --file=trace.json
 */

const fs = require('fs');
// NOTE(review): 'path' is currently unused.
const path = require('path');

// Parse arguments.
// The first non-flag argument is taken as the trace id; --file= and
// --format= are the only recognized flags. Unknown --flags are silently
// ignored.
const args = process.argv.slice(2);
let traceId = null;
let traceFile = null;
let outputFormat = 'text'; // text or json

for (const arg of args) {
  if (arg.startsWith('--file=')) {
    traceFile = arg.split('=')[1];
  } else if (arg.startsWith('--format=')) {
    outputFormat = arg.split('=')[1];
  } else if (!arg.startsWith('--')) {
    traceId = arg;
  }
}
|
||||
|
||||
// Mock trace data (in production, fetch from APM/tracing system).
// Builds a small frontend -> api -> {db, cache} span tree whose db.query
// span dominates the trace, to exercise the bottleneck detection.
function getMockTraceData(id) {
  const dbSpan = {
    spanId: 'span-3',
    service: 'api',
    operation: 'db.query',
    startTime: 1698345600020,
    duration: 7800, // SLOW!
    tags: {
      'db.statement': 'SELECT * FROM users WHERE last_login_at > ...',
      'db.type': 'postgresql',
    },
    children: [],
  };

  const cacheSpan = {
    spanId: 'span-4',
    service: 'api',
    operation: 'cache.get',
    startTime: 1698345608200,
    duration: 5,
    children: [],
  };

  const apiSpan = {
    spanId: 'span-2',
    service: 'api',
    operation: 'GET /api/dashboard',
    startTime: 1698345600010,
    duration: 8200,
    children: [dbSpan, cacheSpan],
  };

  return {
    traceId: id,
    rootSpan: {
      spanId: 'span-1',
      service: 'frontend',
      operation: 'GET /dashboard',
      startTime: 1698345600000,
      duration: 8250, // ms
      children: [apiSpan],
    },
  };
}
|
||||
|
||||
// Load the trace to analyze: from --file when given, otherwise mock data
// for the given trace id. Exits with status 1 on bad input or bad usage.
function loadTrace() {
  if (traceFile) {
    try {
      const raw = fs.readFileSync(traceFile, 'utf8');
      return JSON.parse(raw);
    } catch (error) {
      console.error(`❌ Error loading trace file: ${error.message}`);
      process.exit(1);
    }
  }
  if (traceId) {
    return getMockTraceData(traceId);
  }
  console.error('Usage: node trace-analyzer.js <trace-id> OR --file=trace.json');
  process.exit(1);
}
|
||||
|
||||
// Analyze trace: walk the span tree and aggregate timing statistics.
// Returns { traceId, totalDuration, rootOperation, spanCount, slowSpans,
// bottlenecks, serviceBreakdown }.
function analyzeTrace(trace) {
  const report = {
    traceId: trace.traceId,
    totalDuration: trace.rootSpan.duration,
    rootOperation: trace.rootSpan.operation,
    spanCount: 0,
    slowSpans: [],
    bottlenecks: [],
    serviceBreakdown: {},
  };

  // Iterative depth-first traversal; children are pushed in reverse so
  // the visit order matches a recursive preorder walk.
  const stack = [{ span: trace.rootSpan, depth: 0 }];
  while (stack.length > 0) {
    const { span, depth } = stack.pop();
    report.spanCount += 1;

    // Accumulate per-service time. A parent's duration includes its
    // children's, so service totals may exceed totalDuration by design.
    const entry = report.serviceBreakdown[span.service] || { totalTime: 0, calls: 0 };
    entry.totalTime += span.duration;
    entry.calls += 1;
    report.serviceBreakdown[span.service] = entry;

    // Flag spans slower than 1s.
    if (span.duration > 1000) {
      report.slowSpans.push({
        service: span.service,
        operation: span.operation,
        duration: span.duration,
        percentage: ((span.duration / report.totalDuration) * 100).toFixed(1),
        depth,
      });
    }

    const kids = span.children || [];
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      stack.push({ span: kids[i], depth: depth + 1 });
    }
  }

  // Slowest first.
  report.slowSpans.sort((a, b) => b.duration - a.duration);

  // A bottleneck is any slow span consuming more than half the trace.
  report.bottlenecks = report.slowSpans.filter(
    (s) => parseFloat(s.percentage) > 50
  );

  return report;
}
|
||||
|
||||
// Render a millisecond count as "123ms" below one second, else "1.23s".
function formatDuration(ms) {
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(2)}s`;
}
|
||||
|
||||
// Print analysis (text format)
// Render the full report to stdout: header, per-service time breakdown,
// slow spans (>1s), bottlenecks (>50% of trace) and recommendations.
// NOTE(review): per-service percentages can sum past 100% because a
// parent span's duration includes its children's time.
function printAnalysis(analysis) {
  console.log('========================================');
  console.log('DISTRIBUTED TRACE ANALYSIS');
  console.log('========================================');
  console.log(`Trace ID: ${analysis.traceId}`);
  console.log(`Root Operation: ${analysis.rootOperation}`);
  console.log(`Total Duration: ${formatDuration(analysis.totalDuration)}`);
  console.log(`Total Spans: ${analysis.spanCount}`);
  console.log('');

  // Service breakdown
  console.log('📊 SERVICE BREAKDOWN');
  console.log('-------------------');
  console.log(`${'Service'.padEnd(20)} ${'Time'.padEnd(15)} ${'Calls'.padEnd(10)} ${'% of Total'.padEnd(15)}`);
  console.log('-'.repeat(70));

  for (const [service, data] of Object.entries(analysis.serviceBreakdown)) {
    const percentage = ((data.totalTime / analysis.totalDuration) * 100).toFixed(1);
    console.log(
      `${service.padEnd(20)} ${formatDuration(data.totalTime).padEnd(15)} ${String(data.calls).padEnd(10)} ${percentage}%`
    );
  }
  console.log('');

  // Slow spans — already sorted slowest-first by analyzeTrace; cap at 10.
  if (analysis.slowSpans.length > 0) {
    console.log(`🐌 SLOW SPANS (>${formatDuration(1000)})`);
    console.log('-------------------');
    console.log(`${'Service'.padEnd(15)} ${'Operation'.padEnd(30)} ${'Duration'.padEnd(15)} ${'% of Total'.padEnd(15)}`);
    console.log('-'.repeat(80));

    for (const span of analysis.slowSpans.slice(0, 10)) {
      console.log(
        `${span.service.padEnd(15)} ${span.operation.padEnd(30)} ${formatDuration(span.duration).padEnd(15)} ${span.percentage}%`
      );
    }
    console.log('');
  }

  // Bottlenecks
  if (analysis.bottlenecks.length > 0) {
    console.log('🚨 BOTTLENECKS (>50% of total time)');
    console.log('-----------------------------------');

    for (const bottleneck of analysis.bottlenecks) {
      console.log(`⚠️ ${bottleneck.service} - ${bottleneck.operation}`);
      console.log(` Duration: ${formatDuration(bottleneck.duration)} (${bottleneck.percentage}% of trace)`);
      console.log('');
    }
  }

  // Recommendations, tiered: bottlenecks > slow spans > all clear.
  console.log('💡 RECOMMENDATIONS');
  console.log('-----------------');

  if (analysis.bottlenecks.length > 0) {
    console.log('🔴 CRITICAL: Bottlenecks detected!');
    for (const bottleneck of analysis.bottlenecks) {
      console.log(` - Optimize ${bottleneck.service}.${bottleneck.operation} (${bottleneck.percentage}% of trace)`);

      // Specific recommendations based on operation
      if (bottleneck.operation.includes('db.query')) {
        console.log(' → Add database index, optimize query, add caching');
      } else if (bottleneck.operation.includes('http')) {
        console.log(' → Add timeout, cache response, use async processing');
      } else if (bottleneck.operation.includes('cache')) {
        console.log(' → Check cache hit rate, optimize cache key');
      }
    }
  } else if (analysis.slowSpans.length > 0) {
    console.log('🟡 Some slow spans detected:');
    for (const span of analysis.slowSpans.slice(0, 3)) {
      console.log(` - ${span.service}.${span.operation}: ${formatDuration(span.duration)}`);
    }
  } else {
    console.log('✅ No obvious performance issues detected.');
    console.log(' All spans complete in reasonable time.');
  }

  console.log('');
  console.log('Next steps:');
  console.log(' - Profile slowest spans');
  console.log(' - Check for N+1 queries, missing indexes');
  console.log(' - Add caching where appropriate');
  console.log(' - Review external API timeouts');
  console.log('');
}
|
||||
|
||||
// Main
// Entry point: load the trace (file or mock), analyze it, and emit the
// report in the requested output format (text by default, --format=json
// for machine-readable output).
function main() {
  const trace = loadTrace();
  const analysis = analyzeTrace(trace);

  if (outputFormat === 'json') {
    console.log(JSON.stringify(analysis, null, 2));
  } else {
    printAnalysis(analysis);
  }
}

main();
|
||||
Reference in New Issue
Block a user