Initial commit
This commit is contained in:
213
agents/sre/scripts/log-analyzer.py
Executable file
213
agents/sre/scripts/log-analyzer.py
Executable file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
log-analyzer.py
|
||||
Parse application/system logs for error patterns and anomalies
|
||||
|
||||
Usage: python3 log-analyzer.py /var/log/application.log
|
||||
python3 log-analyzer.py /var/log/application.log --errors-only
|
||||
python3 log-analyzer.py /var/log/application.log --since "2025-10-26 14:00"
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime, timedelta
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Analyze log files for errors and patterns')
|
||||
parser.add_argument('logfile', help='Path to log file')
|
||||
parser.add_argument('--errors-only', action='store_true', help='Show only errors (ERROR, FATAL)')
|
||||
parser.add_argument('--warnings', action='store_true', help='Include warnings')
|
||||
parser.add_argument('--since', help='Show logs since timestamp (YYYY-MM-DD HH:MM)')
|
||||
parser.add_argument('--until', help='Show logs until timestamp (YYYY-MM-DD HH:MM)')
|
||||
parser.add_argument('--pattern', help='Search for specific pattern (regex)')
|
||||
parser.add_argument('--top', type=int, default=10, help='Show top N errors (default: 10)')
|
||||
return parser.parse_args()
|
||||
|
||||
def parse_log_line(line):
|
||||
"""Parse common log formats"""
|
||||
# Try different log formats
|
||||
patterns = [
|
||||
# JSON: {"timestamp":"2025-10-26T14:00:00Z","level":"ERROR","message":"..."}
|
||||
r'\{"timestamp":"(?P<timestamp>[^"]+)".*"level":"(?P<level>[^"]+)".*"message":"(?P<message>[^"]+)"',
|
||||
|
||||
# Standard: [2025-10-26 14:00:00] ERROR: message
|
||||
r'\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+(?P<level>\w+):\s+(?P<message>.*)',
|
||||
|
||||
# Syslog: Oct 26 14:00:00 hostname application[1234]: ERROR message
|
||||
r'(?P<timestamp>\w+ \d+ \d{2}:\d{2}:\d{2})\s+\S+\s+\S+:\s+(?P<level>\w+)\s+(?P<message>.*)',
|
||||
|
||||
# Simple: 2025-10-26 14:00:00 ERROR message
|
||||
r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?P<level>\w+)\s+(?P<message>.*)',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.match(pattern, line)
|
||||
if match:
|
||||
return match.groupdict()
|
||||
|
||||
# If no pattern matched, return raw line
|
||||
return {'timestamp': None, 'level': 'INFO', 'message': line.strip()}
|
||||
|
||||
def parse_timestamp(ts_str):
|
||||
"""Parse various timestamp formats"""
|
||||
if not ts_str:
|
||||
return None
|
||||
|
||||
formats = [
|
||||
'%Y-%m-%dT%H:%M:%SZ',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%b %d %H:%M:%S',
|
||||
]
|
||||
|
||||
for fmt in formats:
|
||||
try:
|
||||
return datetime.strptime(ts_str, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Parse filters
|
||||
since = datetime.strptime(args.since, '%Y-%m-%d %H:%M') if args.since else None
|
||||
until = datetime.strptime(args.until, '%Y-%m-%d %H:%M') if args.until else None
|
||||
|
||||
# Stats
|
||||
total_lines = 0
|
||||
error_count = 0
|
||||
warning_count = 0
|
||||
error_messages = Counter()
|
||||
errors_by_hour = defaultdict(int)
|
||||
error_timeline = []
|
||||
|
||||
print(f"Analyzing log file: {args.logfile}")
|
||||
print("=" * 80)
|
||||
print()
|
||||
|
||||
try:
|
||||
with open(args.logfile, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
for line in f:
|
||||
total_lines += 1
|
||||
|
||||
# Parse log line
|
||||
parsed = parse_log_line(line)
|
||||
level = parsed.get('level', '').upper()
|
||||
message = parsed.get('message', '')
|
||||
timestamp = parse_timestamp(parsed.get('timestamp'))
|
||||
|
||||
# Filter by time range
|
||||
if since and timestamp and timestamp < since:
|
||||
continue
|
||||
if until and timestamp and timestamp > until:
|
||||
continue
|
||||
|
||||
# Filter by pattern
|
||||
if args.pattern and not re.search(args.pattern, message, re.IGNORECASE):
|
||||
continue
|
||||
|
||||
# Filter by level
|
||||
if args.errors_only and level not in ['ERROR', 'FATAL', 'CRITICAL']:
|
||||
continue
|
||||
|
||||
# Count errors and warnings
|
||||
if level in ['ERROR', 'FATAL', 'CRITICAL']:
|
||||
error_count += 1
|
||||
|
||||
# Extract error message (first 100 chars)
|
||||
error_key = message[:100] if len(message) > 100 else message
|
||||
error_messages[error_key] += 1
|
||||
|
||||
# Group by hour
|
||||
if timestamp:
|
||||
hour_key = timestamp.strftime('%Y-%m-%d %H:00')
|
||||
errors_by_hour[hour_key] += 1
|
||||
error_timeline.append((timestamp, message))
|
||||
|
||||
elif level in ['WARN', 'WARNING'] and args.warnings:
|
||||
warning_count += 1
|
||||
|
||||
# Print summary
|
||||
print(f"📊 SUMMARY")
|
||||
print(f"---------")
|
||||
print(f"Total lines: {total_lines:,}")
|
||||
print(f"Errors: {error_count:,}")
|
||||
if args.warnings:
|
||||
print(f"Warnings: {warning_count:,}")
|
||||
print()
|
||||
|
||||
# Top errors
|
||||
if error_messages:
|
||||
print(f"🔥 TOP {args.top} ERRORS")
|
||||
print(f"{'Count':<10} {'Message':<70}")
|
||||
print("-" * 80)
|
||||
for msg, count in error_messages.most_common(args.top):
|
||||
msg_short = (msg[:67] + '...') if len(msg) > 70 else msg
|
||||
print(f"{count:<10} {msg_short}")
|
||||
print()
|
||||
|
||||
# Errors by hour
|
||||
if errors_by_hour:
|
||||
print(f"📈 ERRORS BY HOUR")
|
||||
print(f"{'Hour':<20} {'Count':<10} {'Graph':<50}")
|
||||
print("-" * 80)
|
||||
|
||||
max_errors = max(errors_by_hour.values())
|
||||
for hour in sorted(errors_by_hour.keys()):
|
||||
count = errors_by_hour[hour]
|
||||
bar_length = int((count / max_errors) * 40)
|
||||
bar = '█' * bar_length
|
||||
print(f"{hour:<20} {count:<10} {bar}")
|
||||
print()
|
||||
|
||||
# Error timeline (last 20)
|
||||
if error_timeline:
|
||||
print(f"⏱️ ERROR TIMELINE (Last 20)")
|
||||
print(f"{'Timestamp':<20} {'Message':<60}")
|
||||
print("-" * 80)
|
||||
|
||||
for timestamp, message in sorted(error_timeline, reverse=True)[:20]:
|
||||
ts_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
msg_short = (message[:57] + '...') if len(message) > 60 else message
|
||||
print(f"{ts_str:<20} {msg_short}")
|
||||
print()
|
||||
|
||||
# Recommendations
|
||||
print(f"💡 RECOMMENDATIONS")
|
||||
print(f"-----------------")
|
||||
|
||||
if error_count == 0:
|
||||
print("✅ No errors found. System looks healthy!")
|
||||
elif error_count < 10:
|
||||
print(f"⚠️ {error_count} errors found. Review above for details.")
|
||||
elif error_count < 100:
|
||||
print(f"⚠️ {error_count} errors found. Investigate top errors.")
|
||||
else:
|
||||
print(f"🚨 {error_count} errors found! Immediate investigation required.")
|
||||
print(" - Check for cascading failures")
|
||||
print(" - Review error timeline for spike")
|
||||
print(" - Check related services")
|
||||
|
||||
if errors_by_hour:
|
||||
# Find hour with most errors
|
||||
peak_hour = max(errors_by_hour.items(), key=lambda x: x[1])
|
||||
print(f"\n📍 Peak error hour: {peak_hour[0]} ({peak_hour[1]} errors)")
|
||||
print(f" - Review what happened at this time")
|
||||
print(f" - Check deployment, traffic spike, external dependency")
|
||||
|
||||
print()
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"❌ Error: Log file not found: {args.logfile}")
|
||||
sys.exit(1)
|
||||
except PermissionError:
|
||||
print(f"❌ Error: Permission denied: {args.logfile}")
|
||||
print(f" Try: sudo python3 {sys.argv[0]} {args.logfile}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user