Initial commit
This commit is contained in:
230
agents/sre/scripts/health-check.sh
Executable file
230
agents/sre/scripts/health-check.sh
Executable file
@@ -0,0 +1,230 @@
|
||||
#!/bin/bash

# health-check.sh
# Quick system health check across all layers
# Usage: ./health-check.sh

set -e

echo "========================================="
echo "SYSTEM HEALTH CHECK"
echo "========================================="
echo "Date: $(date)"
echo ""

# ANSI color codes used by the status printers below; NC resets.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Percentage thresholds consumed by print_status for each resource class.
CPU_WARNING=70
CPU_CRITICAL=90
MEM_WARNING=80
MEM_CRITICAL=90
DISK_WARNING=80
DISK_CRITICAL=90
# Helper function for status
|
||||
print_status() {
|
||||
local metric=$1
|
||||
local value=$2
|
||||
local warning=$3
|
||||
local critical=$4
|
||||
local unit=$5
|
||||
|
||||
if (( $(echo "$value >= $critical" | bc -l) )); then
|
||||
echo -e "${RED}✗ $metric: ${value}${unit} (CRITICAL)${NC}"
|
||||
return 2
|
||||
elif (( $(echo "$value >= $warning" | bc -l) )); then
|
||||
echo -e "${YELLOW}⚠ $metric: ${value}${unit} (WARNING)${NC}"
|
||||
return 1
|
||||
else
|
||||
echo -e "${GREEN}✓ $metric: ${value}${unit} (OK)${NC}"
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
# 1. CPU Check
echo "1. CPU Usage"
echo "-------------"
# Idle% parsed from GNU top output, inverted to usage%.
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
# `|| true`: print_status signals severity via its return code; without the
# guard a WARNING/CRITICAL value would abort the script under `set -e`.
print_status "CPU" "$CPU_USAGE" "$CPU_WARNING" "$CPU_CRITICAL" "%" || true

# Top CPU processes
echo " Top 5 CPU processes:"
ps aux | sort -nrk 3,3 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $3}'
echo ""

# 2. Memory Check
echo "2. Memory Usage"
echo "---------------"
MEM_USAGE=$(free | grep Mem | awk '{print ($3/$2) * 100.0}')
print_status "Memory" "$MEM_USAGE" "$MEM_WARNING" "$MEM_CRITICAL" "%" || true

# Memory details
free -h | grep -E "Mem|Swap" | awk '{printf " %s: %s used / %s total\n", $1, $3, $2}'

# Top memory processes
echo " Top 5 memory processes:"
ps aux | sort -nrk 4,4 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $4}'
echo ""

# 3. Disk Check
echo "3. Disk Usage"
echo "-------------"
# The while loop runs in a pipeline subshell; that is fine here because no
# variable needs to survive past the loop.
df -h | grep -vE '^Filesystem|tmpfs|cdrom|loop' | while read -r line; do
  DISK=$(echo "$line" | awk '{print $1}')
  MOUNT=$(echo "$line" | awk '{print $6}')
  USAGE=$(echo "$line" | awk '{print $5}' | sed 's/%//')

  print_status "$MOUNT" "$USAGE" "$DISK_WARNING" "$DISK_CRITICAL" "%" || true
done

# Disk I/O
echo " Disk I/O:"
if command -v iostat &> /dev/null; then
  iostat -x 1 2 | tail -n +4 | awk 'NR>1 {printf " %s: %.1f%% utilization\n", $1, $NF}'
else
  echo " (iostat not installed)"
fi
echo ""

# 4. Network Check
echo "4. Network"
echo "----------"

# Check connectivity
if ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
  echo -e "${GREEN}✓ Internet connectivity: OK${NC}"
else
  echo -e "${RED}✗ Internet connectivity: FAILED${NC}"
fi

# DNS check
if nslookup google.com &> /dev/null; then
  echo -e "${GREEN}✓ DNS resolution: OK${NC}"
else
  echo -e "${RED}✗ DNS resolution: FAILED${NC}"
fi

# Connection count. grep -c prints 0 on no match but exits 1, which would
# fail the assignment under `set -e`; `|| true` keeps the already-printed 0.
CONN_COUNT=$(netstat -an 2>/dev/null | grep -c ESTABLISHED || true)
echo " Active connections: $CONN_COUNT"
echo ""
|
||||
|
||||
# 5. Database Check (if PostgreSQL installed)
echo "5. Database (PostgreSQL)"
echo "------------------------"
if command -v psql &> /dev/null; then
  # Try to connect
  if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
    echo -e "${GREEN}✓ PostgreSQL: Running${NC}"

    # Connection count. `psql -t` pads its output with whitespace, so
    # strip it with xargs before feeding the numbers to bc.
    CONN=$(sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;" | xargs)
    MAX_CONN=$(sudo -u postgres psql -t -c "SHOW max_connections;" | xargs)
    CONN_PCT=$(echo "scale=1; $CONN / $MAX_CONN * 100" | bc)
    # `|| true`: a WARNING/CRITICAL return code must not kill the script
    # under `set -e`.
    print_status "Connections" "$CONN_PCT" "80" "90" "% ($CONN/$MAX_CONN)" || true

    # Database size
    echo " Database sizes:"
    sudo -u postgres psql -t -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;" | head -5 | awk '{printf " - %s: %s\n", $1, $3}'
  else
    echo -e "${RED}✗ PostgreSQL: Not accessible${NC}"
  fi
else
  echo " PostgreSQL not installed"
fi
echo ""

# 6. Services Check
echo "6. Services"
echo "-----------"

# List of services to check (customize as needed)
SERVICES=("nginx" "postgresql" "redis-server")

for service in "${SERVICES[@]}"; do
  if systemctl is-active --quiet "$service" 2>/dev/null; then
    echo -e "${GREEN}✓ $service: Running${NC}"
  else
    # Distinguish "installed but stopped" from "not installed at all".
    if systemctl list-unit-files | grep -q "^$service"; then
      echo -e "${RED}✗ $service: Stopped${NC}"
    else
      echo " $service: Not installed"
    fi
  fi
done
echo ""
|
||||
|
||||
# 7. API Response Time (if applicable)
echo "7. API Health"
echo "-------------"

# Check localhost health endpoint
if command -v curl &> /dev/null; then
  HEALTH_URL="http://localhost/health"

  # Time the request. The body goes to /dev/null, so stdout carries only
  # the -w format: "<http_code>\n<time_total>". (The original format began
  # with "\n", which made line 1 empty so the parsed HTTP code was always
  # blank.) `|| true`: a connection failure must not abort under `set -e`.
  RESPONSE=$(curl -s -w "%{http_code}\n%{time_total}" -o /dev/null "$HEALTH_URL" 2>/dev/null || true)
  HTTP_CODE=$(echo "$RESPONSE" | sed -n '1p')
  TIME=$(echo "$RESPONSE" | sed -n '2p')

  if [ "$HTTP_CODE" = "200" ]; then
    TIME_MS=$(echo "$TIME * 1000" | bc)
    echo -e "${GREEN}✓ Health endpoint: Responding (${TIME_MS}ms)${NC}"
  else
    echo -e "${RED}✗ Health endpoint: Failed (HTTP $HTTP_CODE)${NC}"
  fi
else
  echo " curl not installed"
fi
echo ""

# 8. Load Average
echo "8. Load Average"
echo "---------------"
LOAD=$(uptime | awk -F'load average:' '{ print $2 }')
CORES=$(nproc)
echo " Load: $LOAD"
echo " CPU cores: $CORES"
LOAD_1MIN=$(echo "$LOAD" | awk -F', ' '{print $1}' | xargs)
LOAD_PER_CORE=$(echo "scale=2; $LOAD_1MIN / $CORES" | bc)

if (( $(echo "$LOAD_PER_CORE >= 2.0" | bc -l) )); then
  echo -e "${RED}✗ Load per core: ${LOAD_PER_CORE} (HIGH)${NC}"
elif (( $(echo "$LOAD_PER_CORE >= 1.0" | bc -l) )); then
  echo -e "${YELLOW}⚠ Load per core: ${LOAD_PER_CORE} (ELEVATED)${NC}"
else
  echo -e "${GREEN}✓ Load per core: ${LOAD_PER_CORE} (OK)${NC}"
fi
echo ""

# 9. Recent Errors
echo "9. Recent Errors (last 10 minutes)"
echo "-----------------------------------"
if [ -f /var/log/syslog ]; then
  # Count error lines in the last 1000 syslog lines. (The original piped
  # grep -c's single-number output through `tail -1000`, a no-op that
  # actually counted the entire file.) NOTE: this is the last 1000 lines,
  # not a strict 10-minute window.
  ERROR_COUNT=$(tail -1000 /var/log/syslog | grep -ci "error" || true)
  echo " Syslog errors: $ERROR_COUNT"
fi

# Check journal if systemd
if command -v journalctl &> /dev/null; then
  JOURNAL_ERRORS=$(journalctl --since "10 minutes ago" --priority=err --no-pager | wc -l)
  echo " Journalctl errors: $JOURNAL_ERRORS"
fi
echo ""

# Summary
echo "========================================="
echo "SUMMARY"
echo "========================================="
echo "Health check completed at $(date)"
echo ""
echo "Next steps:"
echo "- If any CRITICAL issues, investigate immediately"
echo "- If WARNING issues, monitor and plan mitigation"
echo "- Review playbooks: ../playbooks/"
echo ""
||||
213
agents/sre/scripts/log-analyzer.py
Executable file
213
agents/sre/scripts/log-analyzer.py
Executable file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
log-analyzer.py
|
||||
Parse application/system logs for error patterns and anomalies
|
||||
|
||||
Usage: python3 log-analyzer.py /var/log/application.log
|
||||
python3 log-analyzer.py /var/log/application.log --errors-only
|
||||
python3 log-analyzer.py /var/log/application.log --since "2025-10-26 14:00"
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime, timedelta
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
def parse_args():
    """Build the analyzer's command-line interface and parse sys.argv."""
    cli = argparse.ArgumentParser(description='Analyze log files for errors and patterns')
    cli.add_argument('logfile', help='Path to log file')
    cli.add_argument('--errors-only', action='store_true', help='Show only errors (ERROR, FATAL)')
    cli.add_argument('--warnings', action='store_true', help='Include warnings')
    cli.add_argument('--since', help='Show logs since timestamp (YYYY-MM-DD HH:MM)')
    cli.add_argument('--until', help='Show logs until timestamp (YYYY-MM-DD HH:MM)')
    cli.add_argument('--pattern', help='Search for specific pattern (regex)')
    cli.add_argument('--top', type=int, default=10, help='Show top N errors (default: 10)')
    return cli.parse_args()
|
||||
|
||||
def parse_log_line(line):
    """Match one raw log line against the known layouts.

    Returns a dict with 'timestamp', 'level' and 'message' keys. Lines
    matching no known layout fall back to level INFO with the stripped
    line as the message and no timestamp.
    """
    known_formats = (
        # JSON: {"timestamp":"2025-10-26T14:00:00Z","level":"ERROR","message":"..."}
        r'\{"timestamp":"(?P<timestamp>[^"]+)".*"level":"(?P<level>[^"]+)".*"message":"(?P<message>[^"]+)"',
        # Standard: [2025-10-26 14:00:00] ERROR: message
        r'\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+(?P<level>\w+):\s+(?P<message>.*)',
        # Syslog: Oct 26 14:00:00 hostname application[1234]: ERROR message
        r'(?P<timestamp>\w+ \d+ \d{2}:\d{2}:\d{2})\s+\S+\s+\S+:\s+(?P<level>\w+)\s+(?P<message>.*)',
        # Simple: 2025-10-26 14:00:00 ERROR message
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?P<level>\w+)\s+(?P<message>.*)',
    )

    for fmt in known_formats:
        hit = re.match(fmt, line)
        if hit is not None:
            return hit.groupdict()

    # Nothing matched: treat the whole line as an INFO message.
    return {'timestamp': None, 'level': 'INFO', 'message': line.strip()}
|
||||
|
||||
def parse_timestamp(ts_str):
    """Parse a timestamp string from any of the supported log formats.

    Returns a datetime, or None when ts_str is falsy or matches no known
    format. Syslog-style stamps ('Oct 26 14:00:00') carry no year;
    strptime defaults them to 1900, which made every --since/--until
    comparison fail, so the current year is substituted instead.
    """
    if not ts_str:
        return None

    formats = [
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S',
        '%b %d %H:%M:%S',
    ]

    for fmt in formats:
        try:
            parsed = datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
        if parsed.year == 1900:
            # Year-less syslog format: assume the current year.
            # NOTE(review): around New Year this can mis-date December
            # entries read in January.
            parsed = parsed.replace(year=datetime.now().year)
        return parsed

    return None
|
||||
|
||||
def main():
    """Drive the analysis: parse CLI args, stream the log file once, and
    print summary, top-error, hourly-histogram, timeline and
    recommendation sections."""
    args = parse_args()

    # Parse filters
    since = datetime.strptime(args.since, '%Y-%m-%d %H:%M') if args.since else None
    until = datetime.strptime(args.until, '%Y-%m-%d %H:%M') if args.until else None

    # Stats
    total_lines = 0
    error_count = 0
    warning_count = 0
    error_messages = Counter()          # message prefix -> occurrences
    errors_by_hour = defaultdict(int)   # 'YYYY-MM-DD HH:00' -> error count
    error_timeline = []                 # (timestamp, message), dated errors only

    print(f"Analyzing log file: {args.logfile}")
    print("=" * 80)
    print()

    try:
        # errors='ignore' drops undecodable bytes rather than raising.
        with open(args.logfile, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                total_lines += 1

                # Parse log line
                parsed = parse_log_line(line)
                level = parsed.get('level', '').upper()
                message = parsed.get('message', '')
                timestamp = parse_timestamp(parsed.get('timestamp'))

                # Filter by time range.
                # NOTE(review): lines without a parseable timestamp bypass
                # the --since/--until filters entirely.
                if since and timestamp and timestamp < since:
                    continue
                if until and timestamp and timestamp > until:
                    continue

                # Filter by pattern
                if args.pattern and not re.search(args.pattern, message, re.IGNORECASE):
                    continue

                # Filter by level
                if args.errors_only and level not in ['ERROR', 'FATAL', 'CRITICAL']:
                    continue

                # Count errors and warnings
                if level in ['ERROR', 'FATAL', 'CRITICAL']:
                    error_count += 1

                    # Extract error message (first 100 chars) so near-identical
                    # messages aggregate under one key.
                    error_key = message[:100] if len(message) > 100 else message
                    error_messages[error_key] += 1

                    # Group by hour (only errors that carry a timestamp).
                    if timestamp:
                        hour_key = timestamp.strftime('%Y-%m-%d %H:00')
                        errors_by_hour[hour_key] += 1
                        error_timeline.append((timestamp, message))

                elif level in ['WARN', 'WARNING'] and args.warnings:
                    warning_count += 1

        # Print summary
        print(f"📊 SUMMARY")
        print(f"---------")
        print(f"Total lines: {total_lines:,}")
        print(f"Errors: {error_count:,}")
        if args.warnings:
            print(f"Warnings: {warning_count:,}")
        print()

        # Top errors, most frequent first.
        if error_messages:
            print(f"🔥 TOP {args.top} ERRORS")
            print(f"{'Count':<10} {'Message':<70}")
            print("-" * 80)
            for msg, count in error_messages.most_common(args.top):
                msg_short = (msg[:67] + '...') if len(msg) > 70 else msg
                print(f"{count:<10} {msg_short}")
            print()

        # Errors by hour, rendered as a bar chart scaled to the peak hour.
        if errors_by_hour:
            print(f"📈 ERRORS BY HOUR")
            print(f"{'Hour':<20} {'Count':<10} {'Graph':<50}")
            print("-" * 80)

            max_errors = max(errors_by_hour.values())
            for hour in sorted(errors_by_hour.keys()):
                count = errors_by_hour[hour]
                bar_length = int((count / max_errors) * 40)
                bar = '█' * bar_length
                print(f"{hour:<20} {count:<10} {bar}")
            print()

        # Error timeline (last 20, newest first)
        if error_timeline:
            print(f"⏱️ ERROR TIMELINE (Last 20)")
            print(f"{'Timestamp':<20} {'Message':<60}")
            print("-" * 80)

            for timestamp, message in sorted(error_timeline, reverse=True)[:20]:
                ts_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                msg_short = (message[:57] + '...') if len(message) > 60 else message
                print(f"{ts_str:<20} {msg_short}")
            print()

        # Recommendations, tiered by total error volume.
        print(f"💡 RECOMMENDATIONS")
        print(f"-----------------")

        if error_count == 0:
            print("✅ No errors found. System looks healthy!")
        elif error_count < 10:
            print(f"⚠️ {error_count} errors found. Review above for details.")
        elif error_count < 100:
            print(f"⚠️ {error_count} errors found. Investigate top errors.")
        else:
            print(f"🚨 {error_count} errors found! Immediate investigation required.")
            print(" - Check for cascading failures")
            print(" - Review error timeline for spike")
            print(" - Check related services")

        if errors_by_hour:
            # Find hour with most errors
            peak_hour = max(errors_by_hour.items(), key=lambda x: x[1])
            print(f"\n📍 Peak error hour: {peak_hour[0]} ({peak_hour[1]} errors)")
            print(f" - Review what happened at this time")
            print(f" - Check deployment, traffic spike, external dependency")

        print()

    except FileNotFoundError:
        print(f"❌ Error: Log file not found: {args.logfile}")
        sys.exit(1)
    except PermissionError:
        print(f"❌ Error: Permission denied: {args.logfile}")
        print(f" Try: sudo python3 {sys.argv[0]} {args.logfile}")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
294
agents/sre/scripts/metrics-collector.sh
Executable file
294
agents/sre/scripts/metrics-collector.sh
Executable file
@@ -0,0 +1,294 @@
|
||||
#!/bin/bash

# metrics-collector.sh
# Gather system metrics for incident diagnosis
# Usage: ./metrics-collector.sh [output_file]

set -e

# Destination report file; defaults to a timestamped name in the CWD.
OUTPUT_FILE=${1:-"metrics-$(date +%Y%m%d-%H%M%S).txt"}

echo "Collecting system metrics..."
echo "Output: $OUTPUT_FILE"
echo ""
|
||||
|
||||
{
|
||||
echo "========================================="
|
||||
echo "SYSTEM METRICS COLLECTION"
|
||||
echo "========================================="
|
||||
echo "Date: $(date)"
|
||||
echo "Hostname: $(hostname)"
|
||||
echo "Uptime: $(uptime -p 2>/dev/null || uptime)"
|
||||
echo ""
|
||||
|
||||
# 1. CPU Metrics
|
||||
echo "========================================="
|
||||
echo "1. CPU METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "CPU Info:"
|
||||
lscpu | grep -E "^Model name|^CPU\(s\)|^Thread|^Core|^Socket"
|
||||
echo ""
|
||||
|
||||
echo "CPU Usage (snapshot):"
|
||||
top -bn1 | head -20
|
||||
echo ""
|
||||
|
||||
echo "Load Average:"
|
||||
uptime
|
||||
echo ""
|
||||
|
||||
if command -v mpstat &> /dev/null; then
|
||||
echo "CPU by Core:"
|
||||
mpstat -P ALL 1 1
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 2. Memory Metrics
|
||||
echo "========================================="
|
||||
echo "2. MEMORY METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Memory Overview:"
|
||||
free -h
|
||||
echo ""
|
||||
|
||||
echo "Memory Details:"
|
||||
cat /proc/meminfo | head -20
|
||||
echo ""
|
||||
|
||||
echo "Top Memory Processes:"
|
||||
ps aux | sort -nrk 4,4 | head -10
|
||||
echo ""
|
||||
|
||||
# 3. Disk Metrics
|
||||
echo "========================================="
|
||||
echo "3. DISK METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Disk Usage:"
|
||||
df -h
|
||||
echo ""
|
||||
|
||||
echo "Inode Usage:"
|
||||
df -i
|
||||
echo ""
|
||||
|
||||
if command -v iostat &> /dev/null; then
|
||||
echo "Disk I/O Stats:"
|
||||
iostat -x 1 5
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo "Disk Space by Directory (/):"
|
||||
du -sh /* 2>/dev/null | sort -hr | head -20
|
||||
echo ""
|
||||
|
||||
# 4. Network Metrics
|
||||
echo "========================================="
|
||||
echo "4. NETWORK METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Network Interfaces:"
|
||||
ip addr show
|
||||
echo ""
|
||||
|
||||
echo "Network Statistics:"
|
||||
netstat -s | head -50
|
||||
echo ""
|
||||
|
||||
echo "Active Connections:"
|
||||
netstat -an | grep ESTABLISHED | wc -l
|
||||
echo ""
|
||||
|
||||
echo "Top 10 IPs by Connection Count:"
|
||||
netstat -ntu | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -10
|
||||
echo ""
|
||||
|
||||
if command -v ss &> /dev/null; then
|
||||
echo "Socket Stats:"
|
||||
ss -s
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 5. Process Metrics
|
||||
echo "========================================="
|
||||
echo "5. PROCESS METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Process Count:"
|
||||
ps aux | wc -l
|
||||
echo ""
|
||||
|
||||
echo "Top CPU Processes:"
|
||||
ps aux | sort -nrk 3,3 | head -10
|
||||
echo ""
|
||||
|
||||
echo "Top Memory Processes:"
|
||||
ps aux | sort -nrk 4,4 | head -10
|
||||
echo ""
|
||||
|
||||
echo "Zombie Processes:"
|
||||
ps aux | grep -E "<defunct>|Z" | grep -v grep
|
||||
echo ""
|
||||
|
||||
# 6. Database Metrics (PostgreSQL)
|
||||
echo "========================================="
|
||||
echo "6. DATABASE METRICS (PostgreSQL)"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
if command -v psql &> /dev/null; then
|
||||
if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
|
||||
echo "PostgreSQL Connection Count:"
|
||||
sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Max Connections:"
|
||||
sudo -u postgres psql -t -c "SHOW max_connections;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Active Queries:"
|
||||
sudo -u postgres psql -x -c "SELECT pid, usename, application_name, state, query FROM pg_stat_activity WHERE state != 'idle' LIMIT 10;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Database Sizes:"
|
||||
sudo -u postgres psql -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;"
|
||||
echo ""
|
||||
|
||||
echo "PostgreSQL Table Sizes (top 10):"
|
||||
sudo -u postgres psql -c "SELECT schemaname, tablename, pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size FROM pg_tables ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC LIMIT 10;"
|
||||
echo ""
|
||||
|
||||
if command -v pg_stat_statements &> /dev/null; then
|
||||
echo "PostgreSQL Slow Queries (top 5):"
|
||||
sudo -u postgres psql -c "SELECT query, calls, total_exec_time, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 5;"
|
||||
echo ""
|
||||
fi
|
||||
else
|
||||
echo "PostgreSQL not accessible"
|
||||
echo ""
|
||||
fi
|
||||
else
|
||||
echo "PostgreSQL not installed"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# 7. Web Server Metrics (nginx)
|
||||
echo "========================================="
|
||||
echo "7. WEB SERVER METRICS (nginx)"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
if systemctl is-active --quiet nginx 2>/dev/null; then
|
||||
echo "Nginx Status: Running"
|
||||
|
||||
if [ -f /var/log/nginx/access.log ]; then
|
||||
echo ""
|
||||
echo "Nginx Request Count (last 1000 lines):"
|
||||
tail -1000 /var/log/nginx/access.log | wc -l
|
||||
|
||||
echo ""
|
||||
echo "Nginx Status Codes (last 1000 lines):"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $9}' | sort | uniq -c | sort -nr
|
||||
|
||||
echo ""
|
||||
echo "Nginx Top 10 URLs:"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $7}' | sort | uniq -c | sort -nr | head -10
|
||||
|
||||
echo ""
|
||||
echo "Nginx Top 10 IPs:"
|
||||
tail -1000 /var/log/nginx/access.log | awk '{print $1}' | sort | uniq -c | sort -nr | head -10
|
||||
fi
|
||||
else
|
||||
echo "Nginx not running"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# 8. Application Metrics (customize as needed)
|
||||
echo "========================================="
|
||||
echo "8. APPLICATION METRICS"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "Application Processes:"
|
||||
ps aux | grep -E "node|java|python|ruby" | grep -v grep
|
||||
echo ""
|
||||
|
||||
echo "Application Ports:"
|
||||
netstat -tlnp 2>/dev/null | grep -E "node|java|python|ruby"
|
||||
echo ""
|
||||
|
||||
  # 9. System Logs (recent errors)
  echo "========================================="
  echo "9. RECENT SYSTEM ERRORS"
  echo "========================================="
  echo ""

  echo "Recent Syslog Errors (last 50):"
  if [ -f /var/log/syslog ]; then
    # The pipeline's last stage is tail, so a non-matching grep cannot
    # trip `set -e` here.
    grep -i "error\|fail\|critical" /var/log/syslog | tail -50
  else
    echo "Syslog not found"
  fi
  echo ""

  echo "Recent Journal Errors (last 10 minutes):"
  if command -v journalctl &> /dev/null; then
    journalctl --since "10 minutes ago" --priority=err --no-pager | tail -50
  else
    echo "journalctl not available"
  fi
  echo ""

  # 10. System Info
  echo "========================================="
  echo "10. SYSTEM INFORMATION"
  echo "========================================="
  echo ""

  echo "OS Version:"
  cat /etc/os-release 2>/dev/null || uname -a
  echo ""

  echo "Kernel Version:"
  uname -r
  echo ""

  echo "System Time:"
  date
  echo ""

  echo "Timezone:"
  timedatectl 2>/dev/null || cat /etc/timezone
  echo ""

  # Summary
  echo "========================================="
  echo "COLLECTION COMPLETE"
  echo "========================================="
  echo "Collected at: $(date)"
  echo "Metrics saved to: $OUTPUT_FILE"
  echo ""

# The entire group above — stdout and stderr — is captured into the report.
} > "$OUTPUT_FILE" 2>&1
|
||||
|
||||
# Print a short confirmation plus report pointers on the console
# (the detailed output lives in $OUTPUT_FILE).
printf '%s\n' \
  "" \
  "✅ Metrics collection complete!" \
  "" \
  "Summary:"
grep -E "CPU Usage|Memory Overview|Disk Usage|Active Connections|PostgreSQL Connection Count" "$OUTPUT_FILE" | head -20
printf '%s\n' \
  "" \
  "Full report: $OUTPUT_FILE" \
  "" \
  "Next steps:" \
  " - Review metrics for anomalies" \
  " - Compare with baseline metrics" \
  " - Share with team for analysis" \
  ""
|
||||
257
agents/sre/scripts/trace-analyzer.js
Executable file
257
agents/sre/scripts/trace-analyzer.js
Executable file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env node

/**
 * trace-analyzer.js
 * Analyze distributed tracing data to identify bottlenecks
 *
 * Usage: node trace-analyzer.js <trace-id>
 *        node trace-analyzer.js <trace-id> --format=json
 *        node trace-analyzer.js --file=trace.json
 */

const fs = require('fs');
// NOTE(review): 'path' is currently unused.
const path = require('path');

// Parse arguments.
// The first non-flag argument is taken as the trace id; --file= and
// --format= are the only recognized flags. Unknown --flags are silently
// ignored.
const args = process.argv.slice(2);
let traceId = null;
let traceFile = null;
let outputFormat = 'text'; // text or json

for (const arg of args) {
  if (arg.startsWith('--file=')) {
    traceFile = arg.split('=')[1];
  } else if (arg.startsWith('--format=')) {
    outputFormat = arg.split('=')[1];
  } else if (!arg.startsWith('--')) {
    traceId = arg;
  }
}
|
||||
|
||||
// Mock trace data (in production, fetch from APM/tracing system).
// Builds a small frontend -> api -> {db, cache} span tree whose db.query
// span dominates the trace, to exercise the bottleneck detection.
function getMockTraceData(id) {
  const dbSpan = {
    spanId: 'span-3',
    service: 'api',
    operation: 'db.query',
    startTime: 1698345600020,
    duration: 7800, // SLOW!
    tags: {
      'db.statement': 'SELECT * FROM users WHERE last_login_at > ...',
      'db.type': 'postgresql',
    },
    children: [],
  };

  const cacheSpan = {
    spanId: 'span-4',
    service: 'api',
    operation: 'cache.get',
    startTime: 1698345608200,
    duration: 5,
    children: [],
  };

  const apiSpan = {
    spanId: 'span-2',
    service: 'api',
    operation: 'GET /api/dashboard',
    startTime: 1698345600010,
    duration: 8200,
    children: [dbSpan, cacheSpan],
  };

  return {
    traceId: id,
    rootSpan: {
      spanId: 'span-1',
      service: 'frontend',
      operation: 'GET /dashboard',
      startTime: 1698345600000,
      duration: 8250, // ms
      children: [apiSpan],
    },
  };
}
|
||||
|
||||
// Load the trace to analyze: from --file when given, otherwise mock data
// for the given trace id. Exits with status 1 on bad input or bad usage.
function loadTrace() {
  if (traceFile) {
    try {
      const raw = fs.readFileSync(traceFile, 'utf8');
      return JSON.parse(raw);
    } catch (error) {
      console.error(`❌ Error loading trace file: ${error.message}`);
      process.exit(1);
    }
  }
  if (traceId) {
    return getMockTraceData(traceId);
  }
  console.error('Usage: node trace-analyzer.js <trace-id> OR --file=trace.json');
  process.exit(1);
}
|
||||
|
||||
// Analyze trace: walk the span tree and aggregate timing statistics.
// Returns { traceId, totalDuration, rootOperation, spanCount, slowSpans,
// bottlenecks, serviceBreakdown }.
function analyzeTrace(trace) {
  const report = {
    traceId: trace.traceId,
    totalDuration: trace.rootSpan.duration,
    rootOperation: trace.rootSpan.operation,
    spanCount: 0,
    slowSpans: [],
    bottlenecks: [],
    serviceBreakdown: {},
  };

  // Iterative depth-first traversal; children are pushed in reverse so
  // the visit order matches a recursive preorder walk.
  const stack = [{ span: trace.rootSpan, depth: 0 }];
  while (stack.length > 0) {
    const { span, depth } = stack.pop();
    report.spanCount += 1;

    // Accumulate per-service time. A parent's duration includes its
    // children's, so service totals may exceed totalDuration by design.
    const entry = report.serviceBreakdown[span.service] || { totalTime: 0, calls: 0 };
    entry.totalTime += span.duration;
    entry.calls += 1;
    report.serviceBreakdown[span.service] = entry;

    // Flag spans slower than 1s.
    if (span.duration > 1000) {
      report.slowSpans.push({
        service: span.service,
        operation: span.operation,
        duration: span.duration,
        percentage: ((span.duration / report.totalDuration) * 100).toFixed(1),
        depth,
      });
    }

    const kids = span.children || [];
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      stack.push({ span: kids[i], depth: depth + 1 });
    }
  }

  // Slowest first.
  report.slowSpans.sort((a, b) => b.duration - a.duration);

  // A bottleneck is any slow span consuming more than half the trace.
  report.bottlenecks = report.slowSpans.filter(
    (s) => parseFloat(s.percentage) > 50
  );

  return report;
}
|
||||
|
||||
// Render a millisecond count as "123ms" below one second, else "1.23s".
function formatDuration(ms) {
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(2)}s`;
}
|
||||
|
||||
// Print analysis (text format)
// Render the full report to stdout: header, per-service time breakdown,
// slow spans (>1s), bottlenecks (>50% of trace) and recommendations.
// NOTE(review): per-service percentages can sum past 100% because a
// parent span's duration includes its children's time.
function printAnalysis(analysis) {
  console.log('========================================');
  console.log('DISTRIBUTED TRACE ANALYSIS');
  console.log('========================================');
  console.log(`Trace ID: ${analysis.traceId}`);
  console.log(`Root Operation: ${analysis.rootOperation}`);
  console.log(`Total Duration: ${formatDuration(analysis.totalDuration)}`);
  console.log(`Total Spans: ${analysis.spanCount}`);
  console.log('');

  // Service breakdown
  console.log('📊 SERVICE BREAKDOWN');
  console.log('-------------------');
  console.log(`${'Service'.padEnd(20)} ${'Time'.padEnd(15)} ${'Calls'.padEnd(10)} ${'% of Total'.padEnd(15)}`);
  console.log('-'.repeat(70));

  for (const [service, data] of Object.entries(analysis.serviceBreakdown)) {
    const percentage = ((data.totalTime / analysis.totalDuration) * 100).toFixed(1);
    console.log(
      `${service.padEnd(20)} ${formatDuration(data.totalTime).padEnd(15)} ${String(data.calls).padEnd(10)} ${percentage}%`
    );
  }
  console.log('');

  // Slow spans — already sorted slowest-first by analyzeTrace; cap at 10.
  if (analysis.slowSpans.length > 0) {
    console.log(`🐌 SLOW SPANS (>${formatDuration(1000)})`);
    console.log('-------------------');
    console.log(`${'Service'.padEnd(15)} ${'Operation'.padEnd(30)} ${'Duration'.padEnd(15)} ${'% of Total'.padEnd(15)}`);
    console.log('-'.repeat(80));

    for (const span of analysis.slowSpans.slice(0, 10)) {
      console.log(
        `${span.service.padEnd(15)} ${span.operation.padEnd(30)} ${formatDuration(span.duration).padEnd(15)} ${span.percentage}%`
      );
    }
    console.log('');
  }

  // Bottlenecks
  if (analysis.bottlenecks.length > 0) {
    console.log('🚨 BOTTLENECKS (>50% of total time)');
    console.log('-----------------------------------');

    for (const bottleneck of analysis.bottlenecks) {
      console.log(`⚠️ ${bottleneck.service} - ${bottleneck.operation}`);
      console.log(` Duration: ${formatDuration(bottleneck.duration)} (${bottleneck.percentage}% of trace)`);
      console.log('');
    }
  }

  // Recommendations, tiered: bottlenecks > slow spans > all clear.
  console.log('💡 RECOMMENDATIONS');
  console.log('-----------------');

  if (analysis.bottlenecks.length > 0) {
    console.log('🔴 CRITICAL: Bottlenecks detected!');
    for (const bottleneck of analysis.bottlenecks) {
      console.log(` - Optimize ${bottleneck.service}.${bottleneck.operation} (${bottleneck.percentage}% of trace)`);

      // Specific recommendations based on operation
      if (bottleneck.operation.includes('db.query')) {
        console.log(' → Add database index, optimize query, add caching');
      } else if (bottleneck.operation.includes('http')) {
        console.log(' → Add timeout, cache response, use async processing');
      } else if (bottleneck.operation.includes('cache')) {
        console.log(' → Check cache hit rate, optimize cache key');
      }
    }
  } else if (analysis.slowSpans.length > 0) {
    console.log('🟡 Some slow spans detected:');
    for (const span of analysis.slowSpans.slice(0, 3)) {
      console.log(` - ${span.service}.${span.operation}: ${formatDuration(span.duration)}`);
    }
  } else {
    console.log('✅ No obvious performance issues detected.');
    console.log(' All spans complete in reasonable time.');
  }

  console.log('');
  console.log('Next steps:');
  console.log(' - Profile slowest spans');
  console.log(' - Check for N+1 queries, missing indexes');
  console.log(' - Add caching where appropriate');
  console.log(' - Review external API timeouts');
  console.log('');
}
|
||||
|
||||
// Main
// Entry point: load the trace (file or mock), analyze it, and emit the
// report in the requested output format (text by default, --format=json
// for machine-readable output).
function main() {
  const trace = loadTrace();
  const analysis = analyzeTrace(trace);

  if (outputFormat === 'json') {
    console.log(JSON.stringify(analysis, null, 2));
  } else {
    printAnalysis(analysis);
  }
}

main();
|
||||
Reference in New Issue
Block a user