Initial commit

Zhongwei Li
2025-11-29 17:56:41 +08:00
commit 9427ed1eea
40 changed files with 15189 additions and 0 deletions


@@ -0,0 +1,230 @@
#!/bin/bash
# health-check.sh
# Quick system health check across all layers
# Usage: ./health-check.sh
# Note: no "set -e" here -- print_status deliberately returns non-zero for
# WARNING/CRITICAL results, and the remaining checks should still run.
echo "========================================="
echo "SYSTEM HEALTH CHECK"
echo "========================================="
echo "Date: $(date)"
echo ""
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Thresholds
CPU_WARNING=70
CPU_CRITICAL=90
MEM_WARNING=80
MEM_CRITICAL=90
DISK_WARNING=80
DISK_CRITICAL=90
# Helper function for status
print_status() {
local metric=$1
local value=$2
local warning=$3
local critical=$4
local unit=$5
if (( $(echo "$value >= $critical" | bc -l) )); then
echo -e "${RED}$metric: ${value}${unit} (CRITICAL)${NC}"
return 2
elif (( $(echo "$value >= $warning" | bc -l) )); then
echo -e "${YELLOW}$metric: ${value}${unit} (WARNING)${NC}"
return 1
else
echo -e "${GREEN}$metric: ${value}${unit} (OK)${NC}"
return 0
fi
}
# 1. CPU Check
echo "1. CPU Usage"
echo "-------------"
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
print_status "CPU" "$CPU_USAGE" "$CPU_WARNING" "$CPU_CRITICAL" "%"
# Top CPU processes
echo " Top 5 CPU processes:"
ps aux | sort -nrk 3,3 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $3}'
echo ""
# 2. Memory Check
echo "2. Memory Usage"
echo "---------------"
MEM_USAGE=$(free | grep Mem | awk '{print ($3/$2) * 100.0}')
print_status "Memory" "$MEM_USAGE" "$MEM_WARNING" "$MEM_CRITICAL" "%"
# Memory details
free -h | grep -E "Mem|Swap" | awk '{printf " %s: %s used / %s total\n", $1, $3, $2}'
# Top memory processes
echo " Top 5 memory processes:"
ps aux | sort -nrk 4,4 | head -5 | awk '{printf " - %s (PID %s): %.1f%%\n", $11, $2, $4}'
echo ""
# 3. Disk Check
echo "3. Disk Usage"
echo "-------------"
df -h | grep -vE '^Filesystem|tmpfs|cdrom|loop' | while read line; do
DISK=$(echo $line | awk '{print $1}')
MOUNT=$(echo $line | awk '{print $6}')
USAGE=$(echo $line | awk '{print $5}' | sed 's/%//')
print_status "$MOUNT" "$USAGE" "$DISK_WARNING" "$DISK_CRITICAL" "%"
done
# Disk I/O
echo " Disk I/O:"
if command -v iostat &> /dev/null; then
iostat -x 1 2 | tail -n +4 | awk 'NR>1 {printf " %s: %.1f%% utilization\n", $1, $NF}'
else
echo " (iostat not installed)"
fi
echo ""
# 4. Network Check
echo "4. Network"
echo "----------"
# Check connectivity
if ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
echo -e "${GREEN}✓ Internet connectivity: OK${NC}"
else
echo -e "${RED}✗ Internet connectivity: FAILED${NC}"
fi
# DNS check
if nslookup google.com &> /dev/null; then
echo -e "${GREEN}✓ DNS resolution: OK${NC}"
else
echo -e "${RED}✗ DNS resolution: FAILED${NC}"
fi
# Connection count
CONN_COUNT=$(netstat -an 2>/dev/null | grep ESTABLISHED | wc -l)
echo " Active connections: $CONN_COUNT"
echo ""
# 5. Database Check (if PostgreSQL installed)
echo "5. Database (PostgreSQL)"
echo "------------------------"
if command -v psql &> /dev/null; then
# Try to connect
if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
echo -e "${GREEN}✓ PostgreSQL: Running${NC}"
# Connection count
CONN=$(sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;" | xargs)
MAX_CONN=$(sudo -u postgres psql -t -c "SHOW max_connections;" | xargs)
CONN_PCT=$(echo "scale=1; $CONN * 100 / $MAX_CONN" | bc)
print_status "Connections" "$CONN_PCT" "80" "90" "% ($CONN/$MAX_CONN)"
# Database size
echo " Database sizes:"
sudo -u postgres psql -t -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;" | head -5 | awk '{printf " - %s: %s\n", $1, $3}'
else
echo -e "${RED}✗ PostgreSQL: Not accessible${NC}"
fi
else
echo " PostgreSQL not installed"
fi
echo ""
# 6. Services Check
echo "6. Services"
echo "-----------"
# List of services to check (customize as needed)
SERVICES=("nginx" "postgresql" "redis-server")
for service in "${SERVICES[@]}"; do
if systemctl is-active --quiet $service 2>/dev/null; then
echo -e "${GREEN}$service: Running${NC}"
else
if systemctl list-unit-files | grep -q "^$service"; then
echo -e "${RED}$service: Stopped${NC}"
else
echo " $service: Not installed"
fi
fi
done
echo ""
# 7. API Response Time (if applicable)
echo "7. API Health"
echo "-------------"
# Check localhost health endpoint
if command -v curl &> /dev/null; then
HEALTH_URL="http://localhost/health"
# Time the request
RESPONSE=$(curl -s -w "%{http_code}\n%{time_total}" -o /dev/null "$HEALTH_URL" 2>/dev/null)
HTTP_CODE=$(echo "$RESPONSE" | sed -n '1p')
TIME=$(echo "$RESPONSE" | sed -n '2p')
if [ "$HTTP_CODE" = "200" ]; then
TIME_MS=$(echo "$TIME * 1000" | bc)
echo -e "${GREEN}✓ Health endpoint: Responding (${TIME_MS}ms)${NC}"
else
echo -e "${RED}✗ Health endpoint: Failed (HTTP $HTTP_CODE)${NC}"
fi
else
echo " curl not installed"
fi
echo ""
# 8. Load Average
echo "8. Load Average"
echo "---------------"
LOAD=$(uptime | awk -F'load average:' '{ print $2 }')
CORES=$(nproc)
echo " Load: $LOAD"
echo " CPU cores: $CORES"
LOAD_1MIN=$(echo $LOAD | awk -F', ' '{print $1}' | xargs)
LOAD_PER_CORE=$(echo "scale=2; $LOAD_1MIN / $CORES" | bc)
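# Rule of thumb: per-core load around 1.0 means the CPUs are fully busy; sustained
# values near 2.0 or above mean work is queuing faster than it can be processed.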
if (( $(echo "$LOAD_PER_CORE >= 2.0" | bc -l) )); then
echo -e "${RED}✗ Load per core: ${LOAD_PER_CORE} (HIGH)${NC}"
elif (( $(echo "$LOAD_PER_CORE >= 1.0" | bc -l) )); then
echo -e "${YELLOW}⚠ Load per core: ${LOAD_PER_CORE} (ELEVATED)${NC}"
else
echo -e "${GREEN}✓ Load per core: ${LOAD_PER_CORE} (OK)${NC}"
fi
echo ""
# 9. Recent Errors
echo "9. Recent Errors (last 10 minutes)"
echo "-----------------------------------"
if [ -f /var/log/syslog ]; then
ERROR_COUNT=$(tail -n 1000 /var/log/syslog 2>/dev/null | grep -ci "error" || true)
echo " Syslog errors (last 1000 lines): $ERROR_COUNT"
fi
# Check journal if systemd
if command -v journalctl &> /dev/null; then
JOURNAL_ERRORS=$(journalctl --since "10 minutes ago" --priority=err --no-pager | wc -l)
echo " Journalctl errors: $JOURNAL_ERRORS"
fi
echo ""
# Summary
echo "========================================="
echo "SUMMARY"
echo "========================================="
echo "Health check completed at $(date)"
echo ""
echo "Next steps:"
echo "- If any CRITICAL issues, investigate immediately"
echo "- If WARNING issues, monitor and plan mitigation"
echo "- Review playbooks: ../playbooks/"
echo ""


@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
log-analyzer.py
Parse application/system logs for error patterns and anomalies
Usage: python3 log-analyzer.py /var/log/application.log
python3 log-analyzer.py /var/log/application.log --errors-only
python3 log-analyzer.py /var/log/application.log --since "2025-10-26 14:00"
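       python3 log-analyzer.py /var/log/application.log --pattern "timeout" --top 20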
"""
import re
import sys
import argparse
from datetime import datetime, timedelta
from collections import Counter, defaultdict
def parse_args():
    parser = argparse.ArgumentParser(description='Analyze log files for errors and patterns')
    parser.add_argument('logfile', help='Path to log file')
    parser.add_argument('--errors-only', action='store_true', help='Show only errors (ERROR, FATAL)')
    parser.add_argument('--warnings', action='store_true', help='Include warnings')
    parser.add_argument('--since', help='Show logs since timestamp (YYYY-MM-DD HH:MM)')
    parser.add_argument('--until', help='Show logs until timestamp (YYYY-MM-DD HH:MM)')
    parser.add_argument('--pattern', help='Search for specific pattern (regex)')
    parser.add_argument('--top', type=int, default=10, help='Show top N errors (default: 10)')
    return parser.parse_args()
def parse_log_line(line):
    """Parse common log formats"""
    # Try different log formats
    patterns = [
        # JSON: {"timestamp":"2025-10-26T14:00:00Z","level":"ERROR","message":"..."}
        r'\{"timestamp":"(?P<timestamp>[^"]+)".*"level":"(?P<level>[^"]+)".*"message":"(?P<message>[^"]+)"',
        # Standard: [2025-10-26 14:00:00] ERROR: message
        r'\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+(?P<level>\w+):\s+(?P<message>.*)',
        # Syslog: Oct 26 14:00:00 hostname application[1234]: ERROR message
        r'(?P<timestamp>\w+ \d+ \d{2}:\d{2}:\d{2})\s+\S+\s+\S+:\s+(?P<level>\w+)\s+(?P<message>.*)',
        # Simple: 2025-10-26 14:00:00 ERROR message
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?P<level>\w+)\s+(?P<message>.*)',
    ]
    for pattern in patterns:
        match = re.match(pattern, line)
        if match:
            return match.groupdict()
    # If no pattern matched, return raw line
    return {'timestamp': None, 'level': 'INFO', 'message': line.strip()}
def parse_timestamp(ts_str):
    """Parse various timestamp formats"""
    if not ts_str:
        return None
    formats = [
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S',
        '%b %d %H:%M:%S',
    ]
    for fmt in formats:
        try:
            return datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
    return None
def main():
    args = parse_args()
    # Parse filters
    since = datetime.strptime(args.since, '%Y-%m-%d %H:%M') if args.since else None
    until = datetime.strptime(args.until, '%Y-%m-%d %H:%M') if args.until else None
    # Stats
    total_lines = 0
    error_count = 0
    warning_count = 0
    error_messages = Counter()
    errors_by_hour = defaultdict(int)
    error_timeline = []
    print(f"Analyzing log file: {args.logfile}")
    print("=" * 80)
    print()
    try:
        with open(args.logfile, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                total_lines += 1
                # Parse log line
                parsed = parse_log_line(line)
                level = parsed.get('level', '').upper()
                message = parsed.get('message', '')
                timestamp = parse_timestamp(parsed.get('timestamp'))
                # Filter by time range
                if since and timestamp and timestamp < since:
                    continue
                if until and timestamp and timestamp > until:
                    continue
                # Filter by pattern
                if args.pattern and not re.search(args.pattern, message, re.IGNORECASE):
                    continue
                # Filter by level
                if args.errors_only and level not in ['ERROR', 'FATAL', 'CRITICAL']:
                    continue
                # Count errors and warnings
                if level in ['ERROR', 'FATAL', 'CRITICAL']:
                    error_count += 1
                    # Extract error message (first 100 chars)
                    error_key = message[:100] if len(message) > 100 else message
                    error_messages[error_key] += 1
                    # Group by hour
                    if timestamp:
                        hour_key = timestamp.strftime('%Y-%m-%d %H:00')
                        errors_by_hour[hour_key] += 1
                        error_timeline.append((timestamp, message))
                elif level in ['WARN', 'WARNING'] and args.warnings:
                    warning_count += 1
        # Print summary
        print(f"📊 SUMMARY")
        print(f"---------")
        print(f"Total lines: {total_lines:,}")
        print(f"Errors: {error_count:,}")
        if args.warnings:
            print(f"Warnings: {warning_count:,}")
        print()
        # Top errors
        if error_messages:
            print(f"🔥 TOP {args.top} ERRORS")
            print(f"{'Count':<10} {'Message':<70}")
            print("-" * 80)
            for msg, count in error_messages.most_common(args.top):
                msg_short = (msg[:67] + '...') if len(msg) > 70 else msg
                print(f"{count:<10} {msg_short}")
            print()
        # Errors by hour
        if errors_by_hour:
            print(f"📈 ERRORS BY HOUR")
            print(f"{'Hour':<20} {'Count':<10} {'Graph':<50}")
            print("-" * 80)
            max_errors = max(errors_by_hour.values())
            for hour in sorted(errors_by_hour.keys()):
                count = errors_by_hour[hour]
                bar_length = int((count / max_errors) * 40)
                bar = '█' * bar_length
                print(f"{hour:<20} {count:<10} {bar}")
            print()
        # Error timeline (last 20)
        if error_timeline:
            print(f"⏱️ ERROR TIMELINE (Last 20)")
            print(f"{'Timestamp':<20} {'Message':<60}")
            print("-" * 80)
            for timestamp, message in sorted(error_timeline, reverse=True)[:20]:
                ts_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                msg_short = (message[:57] + '...') if len(message) > 60 else message
                print(f"{ts_str:<20} {msg_short}")
            print()
        # Recommendations
        print(f"💡 RECOMMENDATIONS")
        print(f"-----------------")
        if error_count == 0:
            print("✅ No errors found. System looks healthy!")
        elif error_count < 10:
            print(f"⚠️ {error_count} errors found. Review above for details.")
        elif error_count < 100:
            print(f"⚠️ {error_count} errors found. Investigate top errors.")
        else:
            print(f"🚨 {error_count} errors found! Immediate investigation required.")
            print(" - Check for cascading failures")
            print(" - Review error timeline for spike")
            print(" - Check related services")
        if errors_by_hour:
            # Find hour with most errors
            peak_hour = max(errors_by_hour.items(), key=lambda x: x[1])
            print(f"\n📍 Peak error hour: {peak_hour[0]} ({peak_hour[1]} errors)")
            print(f" - Review what happened at this time")
            print(f" - Check deployment, traffic spike, external dependency")
        print()
    except FileNotFoundError:
        print(f"❌ Error: Log file not found: {args.logfile}")
        sys.exit(1)
    except PermissionError:
        print(f"❌ Error: Permission denied: {args.logfile}")
        print(f" Try: sudo python3 {sys.argv[0]} {args.logfile}")
        sys.exit(1)
if __name__ == '__main__':
    main()


@@ -0,0 +1,294 @@
#!/bin/bash
# metrics-collector.sh
# Gather system metrics for incident diagnosis
# Usage: ./metrics-collector.sh [output_file]
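# Example: ./metrics-collector.sh /tmp/incident-metrics.txt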
# Note: no "set -e" here -- individual probes (grep with no matches, optional tools)
# may return non-zero, and collection should continue regardless.
OUTPUT_FILE=${1:-"metrics-$(date +%Y%m%d-%H%M%S).txt"}
echo "Collecting system metrics..."
echo "Output: $OUTPUT_FILE"
echo ""
{
echo "========================================="
echo "SYSTEM METRICS COLLECTION"
echo "========================================="
echo "Date: $(date)"
echo "Hostname: $(hostname)"
echo "Uptime: $(uptime -p 2>/dev/null || uptime)"
echo ""
# 1. CPU Metrics
echo "========================================="
echo "1. CPU METRICS"
echo "========================================="
echo ""
echo "CPU Info:"
lscpu | grep -E "^Model name|^CPU\(s\)|^Thread|^Core|^Socket"
echo ""
echo "CPU Usage (snapshot):"
top -bn1 | head -20
echo ""
echo "Load Average:"
uptime
echo ""
if command -v mpstat &> /dev/null; then
echo "CPU by Core:"
mpstat -P ALL 1 1
echo ""
fi
# 2. Memory Metrics
echo "========================================="
echo "2. MEMORY METRICS"
echo "========================================="
echo ""
echo "Memory Overview:"
free -h
echo ""
echo "Memory Details:"
cat /proc/meminfo | head -20
echo ""
echo "Top Memory Processes:"
ps aux | sort -nrk 4,4 | head -10
echo ""
# 3. Disk Metrics
echo "========================================="
echo "3. DISK METRICS"
echo "========================================="
echo ""
echo "Disk Usage:"
df -h
echo ""
echo "Inode Usage:"
df -i
echo ""
if command -v iostat &> /dev/null; then
echo "Disk I/O Stats:"
iostat -x 1 5
echo ""
fi
echo "Disk Space by Directory (/):"
du -sh /* 2>/dev/null | sort -hr | head -20
echo ""
# 4. Network Metrics
echo "========================================="
echo "4. NETWORK METRICS"
echo "========================================="
echo ""
echo "Network Interfaces:"
ip addr show
echo ""
echo "Network Statistics:"
netstat -s | head -50
echo ""
echo "Active Connections:"
netstat -an | grep ESTABLISHED | wc -l
echo ""
echo "Top 10 IPs by Connection Count:"
netstat -ntu | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -10
echo ""
if command -v ss &> /dev/null; then
echo "Socket Stats:"
ss -s
echo ""
fi
# 5. Process Metrics
echo "========================================="
echo "5. PROCESS METRICS"
echo "========================================="
echo ""
echo "Process Count:"
ps aux | wc -l
echo ""
echo "Top CPU Processes:"
ps aux | sort -nrk 3,3 | head -10
echo ""
echo "Top Memory Processes:"
ps aux | sort -nrk 4,4 | head -10
echo ""
echo "Zombie Processes:"
ps aux | awk '$8 ~ /^Z/'
echo ""
# 6. Database Metrics (PostgreSQL)
echo "========================================="
echo "6. DATABASE METRICS (PostgreSQL)"
echo "========================================="
echo ""
if command -v psql &> /dev/null; then
if sudo -u postgres psql -c "SELECT 1" &> /dev/null; then
echo "PostgreSQL Connection Count:"
sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;"
echo ""
echo "PostgreSQL Max Connections:"
sudo -u postgres psql -t -c "SHOW max_connections;"
echo ""
echo "PostgreSQL Active Queries:"
sudo -u postgres psql -x -c "SELECT pid, usename, application_name, state, query FROM pg_stat_activity WHERE state != 'idle' LIMIT 10;"
echo ""
echo "PostgreSQL Database Sizes:"
sudo -u postgres psql -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database WHERE datistemplate = false;"
echo ""
echo "PostgreSQL Table Sizes (top 10):"
sudo -u postgres psql -c "SELECT schemaname, tablename, pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size FROM pg_tables ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC LIMIT 10;"
echo ""
if [ "$(sudo -u postgres psql -tAc "SELECT count(*) FROM pg_extension WHERE extname = 'pg_stat_statements';")" = "1" ]; then
echo "PostgreSQL Slow Queries (top 5):"
sudo -u postgres psql -c "SELECT query, calls, total_exec_time, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 5;"
echo ""
fi
else
echo "PostgreSQL not accessible"
echo ""
fi
else
echo "PostgreSQL not installed"
echo ""
fi
# 7. Web Server Metrics (nginx)
echo "========================================="
echo "7. WEB SERVER METRICS (nginx)"
echo "========================================="
echo ""
if systemctl is-active --quiet nginx 2>/dev/null; then
echo "Nginx Status: Running"
if [ -f /var/log/nginx/access.log ]; then
echo ""
echo "Nginx Request Count (last 1000 lines):"
tail -1000 /var/log/nginx/access.log | wc -l
echo ""
echo "Nginx Status Codes (last 1000 lines):"
tail -1000 /var/log/nginx/access.log | awk '{print $9}' | sort | uniq -c | sort -nr
echo ""
echo "Nginx Top 10 URLs:"
tail -1000 /var/log/nginx/access.log | awk '{print $7}' | sort | uniq -c | sort -nr | head -10
echo ""
echo "Nginx Top 10 IPs:"
tail -1000 /var/log/nginx/access.log | awk '{print $1}' | sort | uniq -c | sort -nr | head -10
fi
else
echo "Nginx not running"
fi
echo ""
# 8. Application Metrics (customize as needed)
echo "========================================="
echo "8. APPLICATION METRICS"
echo "========================================="
echo ""
echo "Application Processes:"
ps aux | grep -E "node|java|python|ruby" | grep -v grep
echo ""
echo "Application Ports:"
netstat -tlnp 2>/dev/null | grep -E "node|java|python|ruby"
echo ""
# 9. System Logs (recent errors)
echo "========================================="
echo "9. RECENT SYSTEM ERRORS"
echo "========================================="
echo ""
echo "Recent Syslog Errors (last 50):"
if [ -f /var/log/syslog ]; then
grep -i "error\|fail\|critical" /var/log/syslog | tail -50
else
echo "Syslog not found"
fi
echo ""
echo "Recent Journal Errors (last 10 minutes):"
if command -v journalctl &> /dev/null; then
journalctl --since "10 minutes ago" --priority=err --no-pager | tail -50
else
echo "journalctl not available"
fi
echo ""
# 10. System Info
echo "========================================="
echo "10. SYSTEM INFORMATION"
echo "========================================="
echo ""
echo "OS Version:"
cat /etc/os-release 2>/dev/null || uname -a
echo ""
echo "Kernel Version:"
uname -r
echo ""
echo "System Time:"
date
echo ""
echo "Timezone:"
timedatectl 2>/dev/null || cat /etc/timezone
echo ""
# Summary
echo "========================================="
echo "COLLECTION COMPLETE"
echo "========================================="
echo "Collected at: $(date)"
echo "Metrics saved to: $OUTPUT_FILE"
echo ""
} > "$OUTPUT_FILE" 2>&1
# Print summary to console
echo ""
echo "✅ Metrics collection complete!"
echo ""
echo "Summary:"
grep -E "CPU Usage|Memory Overview|Disk Usage|Active Connections|PostgreSQL Connection Count" "$OUTPUT_FILE" | head -20
echo ""
echo "Full report: $OUTPUT_FILE"
echo ""
echo "Next steps:"
echo " - Review metrics for anomalies"
echo " - Compare with baseline metrics"
echo " - Share with team for analysis"
echo ""


@@ -0,0 +1,257 @@
#!/usr/bin/env node
/**
* trace-analyzer.js
* Analyze distributed tracing data to identify bottlenecks
*
* Usage: node trace-analyzer.js <trace-id>
* node trace-analyzer.js <trace-id> --format=json
* node trace-analyzer.js --file=trace.json
*/
const fs = require('fs');
const path = require('path');
// Parse arguments
const args = process.argv.slice(2);
let traceId = null;
let traceFile = null;
let outputFormat = 'text'; // text or json
for (const arg of args) {
if (arg.startsWith('--file=')) {
traceFile = arg.split('=')[1];
} else if (arg.startsWith('--format=')) {
outputFormat = arg.split('=')[1];
} else if (!arg.startsWith('--')) {
traceId = arg;
}
}
// Mock trace data (in production, fetch from APM/tracing system)
function getMockTraceData(id) {
return {
traceId: id,
rootSpan: {
spanId: 'span-1',
service: 'frontend',
operation: 'GET /dashboard',
startTime: 1698345600000,
duration: 8250, // ms
children: [
{
spanId: 'span-2',
service: 'api',
operation: 'GET /api/dashboard',
startTime: 1698345600010,
duration: 8200,
children: [
{
spanId: 'span-3',
service: 'api',
operation: 'db.query',
startTime: 1698345600020,
duration: 7800, // SLOW!
tags: {
'db.statement': 'SELECT * FROM users WHERE last_login_at > ...',
'db.type': 'postgresql',
},
children: [],
},
{
spanId: 'span-4',
service: 'api',
operation: 'cache.get',
startTime: 1698345608200,
duration: 5,
children: [],
},
],
},
],
},
};
}
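// A --file=trace.json input is expected to follow the same shape as the mock above:
// { traceId, rootSpan: { spanId, service, operation, startTime, duration, children: [...] } }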
// Load trace from file or mock
function loadTrace() {
if (traceFile) {
try {
const data = fs.readFileSync(traceFile, 'utf8');
return JSON.parse(data);
} catch (error) {
console.error(`❌ Error loading trace file: ${error.message}`);
process.exit(1);
}
} else if (traceId) {
return getMockTraceData(traceId);
} else {
console.error('Usage: node trace-analyzer.js <trace-id> OR --file=trace.json');
process.exit(1);
}
}
// Analyze trace
function analyzeTrace(trace) {
const analysis = {
traceId: trace.traceId,
totalDuration: trace.rootSpan.duration,
rootOperation: trace.rootSpan.operation,
spanCount: 0,
slowSpans: [],
bottlenecks: [],
serviceBreakdown: {},
};
// Traverse spans
function traverseSpans(span, depth = 0) {
analysis.spanCount++;
// Track service time
if (!analysis.serviceBreakdown[span.service]) {
analysis.serviceBreakdown[span.service] = {
totalTime: 0,
calls: 0,
};
}
analysis.serviceBreakdown[span.service].totalTime += span.duration;
analysis.serviceBreakdown[span.service].calls++;
// Identify slow spans (>1s)
if (span.duration > 1000) {
analysis.slowSpans.push({
service: span.service,
operation: span.operation,
duration: span.duration,
percentage: ((span.duration / analysis.totalDuration) * 100).toFixed(1),
depth,
});
}
// Traverse children
if (span.children) {
span.children.forEach(child => traverseSpans(child, depth + 1));
}
}
traverseSpans(trace.rootSpan);
// Sort slow spans by duration
analysis.slowSpans.sort((a, b) => b.duration - a.duration);
// Identify bottlenecks (spans taking >50% of total time)
analysis.bottlenecks = analysis.slowSpans.filter(
span => parseFloat(span.percentage) > 50
);
return analysis;
}
// Format duration
function formatDuration(ms) {
if (ms < 1000) return `${ms}ms`;
return `${(ms / 1000).toFixed(2)}s`;
}
// Print analysis (text format)
function printAnalysis(analysis) {
console.log('========================================');
console.log('DISTRIBUTED TRACE ANALYSIS');
console.log('========================================');
console.log(`Trace ID: ${analysis.traceId}`);
console.log(`Root Operation: ${analysis.rootOperation}`);
console.log(`Total Duration: ${formatDuration(analysis.totalDuration)}`);
console.log(`Total Spans: ${analysis.spanCount}`);
console.log('');
// Service breakdown
console.log('📊 SERVICE BREAKDOWN');
console.log('-------------------');
console.log(`${'Service'.padEnd(20)} ${'Time'.padEnd(15)} ${'Calls'.padEnd(10)} ${'% of Total'.padEnd(15)}`);
console.log('-'.repeat(70));
for (const [service, data] of Object.entries(analysis.serviceBreakdown)) {
const percentage = ((data.totalTime / analysis.totalDuration) * 100).toFixed(1);
console.log(
`${service.padEnd(20)} ${formatDuration(data.totalTime).padEnd(15)} ${String(data.calls).padEnd(10)} ${percentage}%`
);
}
console.log('');
// Slow spans
if (analysis.slowSpans.length > 0) {
console.log(`🐌 SLOW SPANS (>${formatDuration(1000)})`);
console.log('-------------------');
console.log(`${'Service'.padEnd(15)} ${'Operation'.padEnd(30)} ${'Duration'.padEnd(15)} ${'% of Total'.padEnd(15)}`);
console.log('-'.repeat(80));
for (const span of analysis.slowSpans.slice(0, 10)) {
console.log(
`${span.service.padEnd(15)} ${span.operation.padEnd(30)} ${formatDuration(span.duration).padEnd(15)} ${span.percentage}%`
);
}
console.log('');
}
// Bottlenecks
if (analysis.bottlenecks.length > 0) {
console.log('🚨 BOTTLENECKS (>50% of total time)');
console.log('-----------------------------------');
for (const bottleneck of analysis.bottlenecks) {
console.log(`⚠️ ${bottleneck.service} - ${bottleneck.operation}`);
console.log(` Duration: ${formatDuration(bottleneck.duration)} (${bottleneck.percentage}% of trace)`);
console.log('');
}
}
// Recommendations
console.log('💡 RECOMMENDATIONS');
console.log('-----------------');
if (analysis.bottlenecks.length > 0) {
console.log('🔴 CRITICAL: Bottlenecks detected!');
for (const bottleneck of analysis.bottlenecks) {
console.log(` - Optimize ${bottleneck.service}.${bottleneck.operation} (${bottleneck.percentage}% of trace)`);
// Specific recommendations based on operation
if (bottleneck.operation.includes('db.query')) {
console.log(' → Add database index, optimize query, add caching');
} else if (bottleneck.operation.includes('http')) {
console.log(' → Add timeout, cache response, use async processing');
} else if (bottleneck.operation.includes('cache')) {
console.log(' → Check cache hit rate, optimize cache key');
}
}
} else if (analysis.slowSpans.length > 0) {
console.log('🟡 Some slow spans detected:');
for (const span of analysis.slowSpans.slice(0, 3)) {
console.log(` - ${span.service}.${span.operation}: ${formatDuration(span.duration)}`);
}
} else {
console.log('✅ No obvious performance issues detected.');
console.log(' All spans complete in reasonable time.');
}
console.log('');
console.log('Next steps:');
console.log(' - Profile slowest spans');
console.log(' - Check for N+1 queries, missing indexes');
console.log(' - Add caching where appropriate');
console.log(' - Review external API timeouts');
console.log('');
}
// Main
function main() {
const trace = loadTrace();
const analysis = analyzeTrace(trace);
if (outputFormat === 'json') {
console.log(JSON.stringify(analysis, null, 2));
} else {
printAnalysis(analysis);
}
}
main();