Initial commit

Zhongwei Li · 2025-11-29 18:20:21 +08:00 · commit bbbaf7acad
63 changed files with 38552 additions and 0 deletions

commands/debug/.scripts/analyze-logs.sh

@@ -0,0 +1,230 @@
#!/bin/bash
# Purpose: Analyze log files for patterns, errors, and anomalies
# Version: 1.0.0
# Usage: ./analyze-logs.sh --file <log-file> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: awk, grep, sed, jq (optional for JSON logs)
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default values
LOG_FILE=""
PATTERN=""
LEVEL=""
CONTEXT_LINES=5
START_TIME=""
END_TIME=""
OUTPUT_FORMAT="text"
SINCE=""
# Help message
show_help() {
cat << EOF
Log Analysis Utility
Usage: $0 --file <log-file> [options]
Options:
--file FILE Log file to analyze (required)
--pattern REGEX Filter by regex pattern
--level LEVEL Filter by log level (ERROR|WARN|INFO|DEBUG)
--context N Show N lines before and after matches (default: 5)
--start TIME Start time (format: "YYYY-MM-DD HH:MM:SS")
--end TIME End time (format: "YYYY-MM-DD HH:MM:SS")
--since DURATION Time ago (e.g., "1 hour ago", "30 minutes ago")
--format FORMAT Output format: text|json (default: text)
-h, --help Show this help message
Examples:
# Find all errors in last hour
$0 --file app.log --level ERROR --since "1 hour ago"
# Find timeout errors with context
$0 --file app.log --pattern "timeout" --context 10
# Analyze specific timeframe
$0 --file app.log --start "2024-10-14 14:00:00" --end "2024-10-14 15:00:00"
EOF
exit 0
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--file)
LOG_FILE="$2"
shift 2
;;
--pattern)
PATTERN="$2"
shift 2
;;
--level)
LEVEL="$2"
shift 2
;;
--context)
CONTEXT_LINES="$2"
shift 2
;;
--start)
START_TIME="$2"
shift 2
;;
--end)
END_TIME="$2"
shift 2
;;
--since)
SINCE="$2"
shift 2
;;
--format)
OUTPUT_FORMAT="$2"
shift 2
;;
-h|--help)
show_help
;;
*)
echo -e "${RED}Error: Unknown option $1${NC}" >&2
exit 2
;;
esac
done
# Validate required parameters
if [ -z "$LOG_FILE" ]; then
echo -e "${RED}Error: --file is required${NC}" >&2
echo "Use --help for usage information"
exit 2
fi
if [ ! -f "$LOG_FILE" ]; then
echo -e "${RED}Error: Log file not found: $LOG_FILE${NC}" >&2
exit 1
fi
# Functions
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Convert "since" to start time
if [ -n "$SINCE" ]; then
if command -v date &> /dev/null; then
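# GNU date parses "$SINCE" directly; the BSD fallback (-v -1H) cannot, so it always uses a fixed 1-hour window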
START_TIME=$(date -d "$SINCE" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -v -1H '+%Y-%m-%d %H:%M:%S')
fi
fi
log_info "Analyzing log file: $LOG_FILE"
# Build grep command
GREP_CMD="cat '$LOG_FILE'"
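# The pipeline is assembled as a string and run with eval below; filter values containing single quotes will break the quoting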
# Time filtering
if [ -n "$START_TIME" ]; then
log_info "Filtering from: $START_TIME"
GREP_CMD="$GREP_CMD | awk '\$0 >= \"$START_TIME\"'"
fi
if [ -n "$END_TIME" ]; then
log_info "Filtering to: $END_TIME"
GREP_CMD="$GREP_CMD | awk '\$0 <= \"$END_TIME\"'"
fi
# Level filtering
if [ -n "$LEVEL" ]; then
log_info "Filtering by level: $LEVEL"
GREP_CMD="$GREP_CMD | grep -i '$LEVEL'"
fi
# Pattern filtering
if [ -n "$PATTERN" ]; then
log_info "Filtering by pattern: $PATTERN"
GREP_CMD="$GREP_CMD | grep -E '$PATTERN' -A $CONTEXT_LINES -B $CONTEXT_LINES"
fi
# Execute filtering
# "|| true" keeps set -e from aborting the script when the pipeline finds no matches
FILTERED_OUTPUT=$(eval "$GREP_CMD" || true)
if [ -z "$FILTERED_OUTPUT" ]; then
log_warn "No matching log entries found"
exit 0
fi
# Count results
MATCH_COUNT=$(echo "$FILTERED_OUTPUT" | wc -l)
log_info "Found $MATCH_COUNT matching lines"
# Analysis
echo ""
echo "═══════════════════════════════════════════════════════════"
echo " LOG ANALYSIS RESULTS"
echo "═══════════════════════════════════════════════════════════"
echo ""
# Error statistics
echo "Error Statistics:"
echo "─────────────────────────────────────────────────────────"
# grep -c prints 0 when nothing matches; "|| true" keeps pipefail from aborting
ERROR_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "ERROR" || true)
WARN_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "WARN" || true)
INFO_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "INFO" || true)
echo " ERROR: $ERROR_COUNT"
echo " WARN: $WARN_COUNT"
echo " INFO: $INFO_COUNT"
echo ""
# Top errors
echo "Top Error Messages (Top 10):"
echo "─────────────────────────────────────────────────────────"
echo "$FILTERED_OUTPUT" | grep -i "ERROR" | awk -F'ERROR' '{print $2}' | sort | uniq -c | sort -rn | head -10 || echo " No errors found"
echo ""
# Time distribution (if timestamps present)
echo "Time Distribution:"
echo "─────────────────────────────────────────────────────────"
echo "$FILTERED_OUTPUT" | awk '{print substr($0, 1, 13)}' | sort | uniq -c | tail -20 || echo " No timestamp pattern detected"
echo ""
# Output filtered results
if [ "$OUTPUT_FORMAT" = "json" ]; then
log_info "Generating JSON output..."
# Simple JSON array of log lines
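# NOTE: lines are embedded as-is; entries containing double quotes or backslashes will produce invalid JSON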
echo "{"
echo " \"file\": \"$LOG_FILE\","
echo " \"matches\": $MATCH_COUNT,"
echo " \"entries\": ["
echo "$FILTERED_OUTPUT" | awk '{printf " \"%s\",\n", $0}' | sed '$ s/,$//'
echo " ]"
echo "}"
else
echo "Matching Log Entries:"
echo "─────────────────────────────────────────────────────────"
echo "$FILTERED_OUTPUT"
fi
echo ""
log_success "Analysis complete"
exit 0

commands/debug/.scripts/memory-check.sh

@@ -0,0 +1,418 @@
#!/bin/bash
# Purpose: Monitor memory usage and detect leaks
# Version: 1.0.0
# Usage: ./memory-check.sh --app <app-name> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: ps, awk, bc
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default values
APP_NAME=""
DURATION=300
INTERVAL=10
THRESHOLD=1024
OUTPUT_DIR="./memory-check-output"
ALERT_ON_GROWTH=true
# Help message
show_help() {
cat << EOF
Memory Monitoring Utility
Usage: $0 --app <app-name> [options]
Options:
--app NAME Application/process name to monitor (required)
--duration N Monitoring duration in seconds (default: 300)
--interval N Sampling interval in seconds (default: 10)
--threshold MB Alert if memory exceeds threshold in MB (default: 1024)
--output DIR Output directory (default: ./memory-check-output)
--no-alert Disable growth alerts
-h, --help Show this help message
Examples:
# Monitor Node.js app for 5 minutes
$0 --app node --duration 300
# Monitor with custom threshold
$0 --app node --duration 600 --threshold 2048
# Quick check (1 minute)
$0 --app node --duration 60 --interval 5
EOF
exit 0
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--app)
APP_NAME="$2"
shift 2
;;
--duration)
DURATION="$2"
shift 2
;;
--interval)
INTERVAL="$2"
shift 2
;;
--threshold)
THRESHOLD="$2"
shift 2
;;
--output)
OUTPUT_DIR="$2"
shift 2
;;
--no-alert)
ALERT_ON_GROWTH=false
shift
;;
-h|--help)
show_help
;;
*)
echo -e "${RED}Error: Unknown option $1${NC}" >&2
exit 2
;;
esac
done
# Validate required parameters
if [ -z "$APP_NAME" ]; then
echo -e "${RED}Error: --app is required${NC}" >&2
echo "Use --help for usage information"
exit 2
fi
# Functions
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
alert() {
echo -e "${RED}[ALERT]${NC} $1"
}
# Create output directory
mkdir -p "$OUTPUT_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
log_info "Starting memory monitoring for: $APP_NAME"
log_info "Duration: ${DURATION}s, Interval: ${INTERVAL}s, Threshold: ${THRESHOLD}MB"
log_info "Output directory: $OUTPUT_DIR"
# Find process ID
PIDS=$(pgrep -f "$APP_NAME" || echo "")
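# NOTE: pgrep -f matches the full command line, so this script's own invocation (which contains the app name) may also match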
if [ -z "$PIDS" ]; then
log_error "No process found matching: $APP_NAME"
exit 1
fi
PID=$(echo "$PIDS" | head -1)
log_info "Found process: PID $PID"
# Output files
MEMORY_LOG="$OUTPUT_DIR/memory-log-$TIMESTAMP.txt"
CHART_FILE="$OUTPUT_DIR/memory-chart-$TIMESTAMP.txt"
REPORT_FILE="$OUTPUT_DIR/memory-report-$TIMESTAMP.txt"
# Write header
echo "Timestamp,RSS_KB,VSZ_KB,Percent_MEM" > "$MEMORY_LOG"
log_info "Monitoring memory usage..."
# Track min/max
MIN_RSS=0
MAX_RSS=0
READINGS=()
# Collect memory samples
SAMPLES=$((DURATION / INTERVAL))
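# Integer division: e.g. 300s / 10s = 30 samples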
for i in $(seq 1 $SAMPLES); do
# Get memory stats
MEM_STATS=$(ps -p "$PID" -o rss=,vsz=,%mem= 2>/dev/null || echo "")
if [ -z "$MEM_STATS" ]; then
log_error "Process $PID not found. It may have terminated."
break
fi
# Parse values
RSS=$(echo "$MEM_STATS" | awk '{print $1}')
VSZ=$(echo "$MEM_STATS" | awk '{print $2}')
PMEM=$(echo "$MEM_STATS" | awk '{print $3}')
TIMESTAMP_NOW=$(date '+%Y-%m-%d %H:%M:%S')
# Update min/max
if [ "$MIN_RSS" -eq 0 ] || [ "$RSS" -lt "$MIN_RSS" ]; then
MIN_RSS=$RSS
fi
if [ "$RSS" -gt "$MAX_RSS" ]; then
MAX_RSS=$RSS
fi
# Store reading
READINGS+=($RSS)
# Log to file
echo "$TIMESTAMP_NOW,$RSS,$VSZ,$PMEM" >> "$MEMORY_LOG"
# Convert to MB for display
RSS_MB=$(echo "scale=2; $RSS/1024" | bc)
VSZ_MB=$(echo "scale=2; $VSZ/1024" | bc)
# Progress display
echo -ne "\r Sample $i/$SAMPLES: RSS=${RSS_MB}MB, VSZ=${VSZ_MB}MB, %MEM=${PMEM}% "
# Check threshold
if (( $(echo "$RSS_MB > $THRESHOLD" | bc -l) )); then
echo "" # New line before alert
alert "Memory threshold exceeded: ${RSS_MB}MB > ${THRESHOLD}MB"
fi
sleep "$INTERVAL"
done
echo "" # New line after progress
log_success "Memory monitoring complete"
# Calculate statistics
if [ ${#READINGS[@]} -eq 0 ]; then
log_error "No memory samples were collected; the process may have exited immediately"
exit 1
fi
MIN_MB=$(echo "scale=2; $MIN_RSS/1024" | bc)
MAX_MB=$(echo "scale=2; $MAX_RSS/1024" | bc)
GROWTH_MB=$(echo "scale=2; ($MAX_RSS-$MIN_RSS)/1024" | bc)
# Calculate average
TOTAL_RSS=0
for rss in "${READINGS[@]}"; do
TOTAL_RSS=$((TOTAL_RSS + rss))
done
AVG_RSS=$((TOTAL_RSS / ${#READINGS[@]}))
AVG_MB=$(echo "scale=2; $AVG_RSS/1024" | bc)
# Detect leak (memory consistently growing)
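# Heuristic: flag a leak only when total growth exceeds 50 MB and the second-half average exceeds the first-half average by more than 25 MB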
LEAK_DETECTED=false
if (( $(echo "$GROWTH_MB > 50" | bc -l) )); then
# Check if growth is consistent (not just spike)
FIRST_HALF_AVG=0
SECOND_HALF_AVG=0
MID_POINT=$((${#READINGS[@]} / 2))
for i in $(seq 0 $((MID_POINT - 1))); do
FIRST_HALF_AVG=$((FIRST_HALF_AVG + READINGS[$i]))
done
FIRST_HALF_AVG=$((FIRST_HALF_AVG / MID_POINT))
for i in $(seq $MID_POINT $((${#READINGS[@]} - 1))); do
SECOND_HALF_AVG=$((SECOND_HALF_AVG + READINGS[$i]))
done
SECOND_HALF_AVG=$((SECOND_HALF_AVG / (${#READINGS[@]} - MID_POINT)))
CONSISTENT_GROWTH=$((SECOND_HALF_AVG - FIRST_HALF_AVG))
CONSISTENT_GROWTH_MB=$(echo "scale=2; $CONSISTENT_GROWTH/1024" | bc)
if (( $(echo "$CONSISTENT_GROWTH_MB > 25" | bc -l) )); then
LEAK_DETECTED=true
fi
fi
# Generate ASCII chart
log_info "Generating memory chart..."
cat > "$CHART_FILE" << EOF
Memory Usage Over Time
═══════════════════════════════════════════════════════════
RSS (Resident Set Size) in MB
EOF
# Simple ASCII chart (CHART_HEIGHT rows, scaled to peak RSS)
CHART_HEIGHT=20
SCALE_FACTOR=$(echo "scale=2; $MAX_RSS / $CHART_HEIGHT" | bc)
# Redirect the whole chart body into the chart file (it is displayed at the end)
{
for row in $(seq $CHART_HEIGHT -1 0); do
THRESHOLD_LINE=$(echo "scale=0; $row * $SCALE_FACTOR / 1024" | bc)
printf "%4d MB |" "$THRESHOLD_LINE"
for reading in "${READINGS[@]}"; do
READING_ROW=$(echo "scale=0; $reading / $SCALE_FACTOR" | bc)
if [ "$READING_ROW" -ge "$row" ]; then
printf "█"
else
printf " "
fi
done
echo ""
done
printf " +"
for i in $(seq 1 ${#READINGS[@]}); do printf "─"; done
echo ""
printf " "
for i in $(seq 1 ${#READINGS[@]}); do
if [ $((i % 10)) -eq 0 ]; then
printf "|"
else
printf " "
fi
done
echo ""
cat >> "$CHART_FILE" << EOF
Legend: Each column = ${INTERVAL}s interval
Total duration: ${DURATION}s
EOF
cat "$CHART_FILE"
# Generate report
log_info "Generating memory report..."
cat > "$REPORT_FILE" << EOF
═══════════════════════════════════════════════════════════
MEMORY MONITORING REPORT
═══════════════════════════════════════════════════════════
Application: $APP_NAME
PID: $PID
Duration: ${DURATION}s (${SAMPLES} samples)
Interval: ${INTERVAL}s
Timestamp: $TIMESTAMP
Memory Statistics:
─────────────────────────────────────────────────────────
Minimum RSS: ${MIN_MB} MB
Maximum RSS: ${MAX_MB} MB
Average RSS: ${AVG_MB} MB
Memory Growth: ${GROWTH_MB} MB
Threshold: ${THRESHOLD} MB
EOF
# Leak analysis
if [ "$LEAK_DETECTED" = true ]; then
cat >> "$REPORT_FILE" << EOF
⚠ MEMORY LEAK DETECTED
─────────────────────────────────────────────────────────
Memory grew consistently by ${CONSISTENT_GROWTH_MB} MB
First half average: $(echo "scale=2; $FIRST_HALF_AVG/1024" | bc) MB
Second half average: $(echo "scale=2; $SECOND_HALF_AVG/1024" | bc) MB
Recommendations:
1. Take heap snapshots for detailed analysis
2. Check for:
- Event listeners not removed
- Timers not cleared (setInterval, setTimeout)
- Unbounded caches or arrays
- Circular references
- Closures holding large objects
3. Use memory profiling tools:
- Node.js: node --inspect, heap snapshots
- Python: memory_profiler, tracemalloc
4. Consider using /debug memory operation for deeper analysis
EOF
if [ "$ALERT_ON_GROWTH" = true ]; then
alert "MEMORY LEAK DETECTED! Growth: ${CONSISTENT_GROWTH_MB} MB"
fi
else
cat >> "$REPORT_FILE" << EOF
✓ NO MEMORY LEAK DETECTED
─────────────────────────────────────────────────────────
Memory usage is stable
Growth of ${GROWTH_MB} MB is within acceptable range
EOF
log_success "No memory leak detected"
fi
# Threshold warnings
if (( $(echo "$MAX_MB > $THRESHOLD" | bc -l) )); then
cat >> "$REPORT_FILE" << EOF
⚠ THRESHOLD EXCEEDED
─────────────────────────────────────────────────────────
Peak memory (${MAX_MB} MB) exceeded threshold (${THRESHOLD} MB)
Recommendations:
1. Increase memory allocation if necessary
2. Optimize memory usage:
- Use streaming for large data
- Implement pagination
- Use efficient data structures
- Clear unused objects
3. Set appropriate container/VM memory limits
EOF
fi
# Output files
cat >> "$REPORT_FILE" << EOF
Output Files:
─────────────────────────────────────────────────────────
Memory Log: $MEMORY_LOG
Memory Chart: $CHART_FILE
This Report: $REPORT_FILE
Next Steps:
─────────────────────────────────────────────────────────
EOF
if [ "$LEAK_DETECTED" = true ]; then
cat >> "$REPORT_FILE" << EOF
1. Use /debug memory for heap profiling
2. Take heap snapshots before and after operations
3. Review code for common leak patterns
4. Monitor production with these findings
EOF
else
cat >> "$REPORT_FILE" << EOF
1. Continue monitoring in production
2. Set up alerts for memory threshold
3. Schedule periodic memory checks
EOF
fi
echo "" >> "$REPORT_FILE"
echo "═══════════════════════════════════════════════════════════" >> "$REPORT_FILE"
log_success "Report saved to: $REPORT_FILE"
# Display report
cat "$REPORT_FILE"
# Exit with appropriate code
if [ "$LEAK_DETECTED" = true ]; then
exit 1
else
exit 0
fi

commands/debug/.scripts/profile.sh

@@ -0,0 +1,297 @@
#!/bin/bash
# Purpose: Profile application performance (CPU, memory, I/O)
# Version: 1.0.0
# Usage: ./profile.sh --app <app-name> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: ps, top, pidstat (optional)
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default values
APP_NAME=""
DURATION=60
INTERVAL=1
OUTPUT_DIR="./profile-output"
PROFILE_TYPE="all"
ENDPOINT=""
# Help message
show_help() {
cat << EOF
Application Profiling Utility
Usage: $0 --app <app-name> [options]
Options:
--app NAME Application/process name to profile (required)
--duration N Profile duration in seconds (default: 60)
--interval N Sampling interval in seconds (default: 1)
--type TYPE Profile type: cpu|memory|io|all (default: all)
--endpoint URL Optional: HTTP endpoint to load test during profiling
--output DIR Output directory (default: ./profile-output)
-h, --help Show this help message
Examples:
# Profile Node.js app for 2 minutes
$0 --app node --duration 120
# Profile with load test
$0 --app node --duration 60 --endpoint http://localhost:3000/api/test
# Profile only CPU
$0 --app node --duration 30 --type cpu
EOF
exit 0
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--app)
APP_NAME="$2"
shift 2
;;
--duration)
DURATION="$2"
shift 2
;;
--interval)
INTERVAL="$2"
shift 2
;;
--type)
PROFILE_TYPE="$2"
shift 2
;;
--endpoint)
ENDPOINT="$2"
shift 2
;;
--output)
OUTPUT_DIR="$2"
shift 2
;;
-h|--help)
show_help
;;
*)
echo -e "${RED}Error: Unknown option $1${NC}" >&2
exit 2
;;
esac
done
# Validate required parameters
if [ -z "$APP_NAME" ]; then
echo -e "${RED}Error: --app is required${NC}" >&2
echo "Use --help for usage information"
exit 2
fi
# Functions
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Create output directory
mkdir -p "$OUTPUT_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
log_info "Starting profiling for: $APP_NAME"
log_info "Duration: ${DURATION}s, Interval: ${INTERVAL}s"
log_info "Output directory: $OUTPUT_DIR"
# Find process ID
PIDS=$(pgrep -f "$APP_NAME" || echo "")
if [ -z "$PIDS" ]; then
log_error "No process found matching: $APP_NAME"
exit 1
fi
PID=$(echo "$PIDS" | head -1)
log_info "Found process: PID $PID"
# Start load test if endpoint provided
LOAD_TEST_PID=""
if [ -n "$ENDPOINT" ]; then
log_info "Starting load test on: $ENDPOINT"
if command -v ab &> /dev/null; then
# Use Apache Bench
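# A deliberately large request count keeps load running for the whole profiling window; the process is killed after profiling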
ab -n 100000 -c 10 "$ENDPOINT" > "$OUTPUT_DIR/load-test-$TIMESTAMP.log" 2>&1 &
LOAD_TEST_PID=$!
log_info "Load test started (PID: $LOAD_TEST_PID)"
else
log_warn "Apache Bench (ab) not found, skipping load test"
fi
fi
# CPU Profiling
if [ "$PROFILE_TYPE" = "cpu" ] || [ "$PROFILE_TYPE" = "all" ]; then
log_info "Profiling CPU usage..."
CPU_OUTPUT="$OUTPUT_DIR/cpu-profile-$TIMESTAMP.txt"
# Collect CPU samples
for i in $(seq 1 $((DURATION / INTERVAL))); do
ps -p "$PID" -o %cpu,rss,vsz,cmd >> "$CPU_OUTPUT" 2>/dev/null || true
sleep "$INTERVAL"
done
log_success "CPU profile saved to: $CPU_OUTPUT"
# Calculate statistics
# Only process numeric rows, skipping the ps header line repeated in every sample
AVG_CPU=$(awk '$1 ~ /^[0-9.]+$/ {sum+=$1; count++} END {if (count>0) print sum/count; else print 0}' "$CPU_OUTPUT")
MAX_CPU=$(awk '$1 ~ /^[0-9.]+$/ {if ($1>max) max=$1} END {print max+0}' "$CPU_OUTPUT")
echo "CPU Statistics:" > "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
echo " Average CPU: $AVG_CPU%" >> "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
echo " Peak CPU: $MAX_CPU%" >> "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
fi
# Memory Profiling
if [ "$PROFILE_TYPE" = "memory" ] || [ "$PROFILE_TYPE" = "all" ]; then
log_info "Profiling memory usage..."
MEM_OUTPUT="$OUTPUT_DIR/memory-profile-$TIMESTAMP.txt"
# Collect memory samples
for i in $(seq 1 $((DURATION / INTERVAL))); do
ps -p "$PID" -o rss,vsz,%mem,cmd >> "$MEM_OUTPUT" 2>/dev/null || true
sleep "$INTERVAL"
done
log_success "Memory profile saved to: $MEM_OUTPUT"
# Calculate statistics
# Only process numeric rows, skipping the ps header line repeated in every sample
AVG_RSS=$(awk '$1 ~ /^[0-9]+$/ {sum+=$1; count++} END {if (count>0) print sum/count; else print 0}' "$MEM_OUTPUT")
MAX_RSS=$(awk '$1 ~ /^[0-9]+$/ {if ($1>max) max=$1} END {print max+0}' "$MEM_OUTPUT")
MIN_RSS=$(awk '$1 ~ /^[0-9]+$/ {if (min=="") min=$1; if ($1<min) min=$1} END {print min+0}' "$MEM_OUTPUT")
echo "Memory Statistics:" > "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
echo " Average RSS: $(echo "scale=2; $AVG_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
echo " Peak RSS: $(echo "scale=2; $MAX_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
echo " Min RSS: $(echo "scale=2; $MIN_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
echo " Memory Growth: $(echo "scale=2; ($MAX_RSS-$MIN_RSS)/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
fi
# I/O Profiling
if [ "$PROFILE_TYPE" = "io" ] || [ "$PROFILE_TYPE" = "all" ]; then
log_info "Profiling I/O usage..."
IO_OUTPUT="$OUTPUT_DIR/io-profile-$TIMESTAMP.txt"
# Check if process has I/O stats available
if [ -f "/proc/$PID/io" ]; then
# Collect I/O samples
for i in $(seq 1 $((DURATION / INTERVAL))); do
echo "=== Sample $i ===" >> "$IO_OUTPUT"
cat "/proc/$PID/io" >> "$IO_OUTPUT" 2>/dev/null || true
sleep "$INTERVAL"
done
log_success "I/O profile saved to: $IO_OUTPUT"
else
log_warn "I/O profiling not available for this process"
fi
fi
# Stop load test if running
if [ -n "$LOAD_TEST_PID" ]; then
log_info "Stopping load test..."
kill "$LOAD_TEST_PID" 2>/dev/null || true
wait "$LOAD_TEST_PID" 2>/dev/null || true
fi
# Generate summary report
REPORT_FILE="$OUTPUT_DIR/profile-report-$TIMESTAMP.txt"
cat > "$REPORT_FILE" << EOF
═══════════════════════════════════════════════════════════
PERFORMANCE PROFILE REPORT
═══════════════════════════════════════════════════════════
Application: $APP_NAME
PID: $PID
Duration: ${DURATION}s
Interval: ${INTERVAL}s
Timestamp: $TIMESTAMP
EOF
# Add CPU summary if available
if [ -f "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" ]; then
cat "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
fi
# Add memory summary if available
if [ -f "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" ]; then
cat "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
fi
# Add recommendations
cat >> "$REPORT_FILE" << EOF
Recommendations:
─────────────────────────────────────────────────────────
EOF
if [ -f "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" ]; then
MAX_CPU=$(awk '/Peak CPU:/ {print $3}' "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" | sed 's/%//')
if [ -n "$MAX_CPU" ] && (( $(echo "$MAX_CPU > 80" | bc -l) )); then
echo " ⚠ High CPU usage detected (${MAX_CPU}%)" >> "$REPORT_FILE"
echo " - Consider optimizing CPU-intensive operations" >> "$REPORT_FILE"
echo " - Profile with flame graphs for detailed analysis" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
fi
fi
if [ -f "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" ]; then
GROWTH=$(awk '/Memory Growth:/ {print $3}' "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt")
if [ -n "$GROWTH" ] && (( $(echo "$GROWTH > 100" | bc -l) )); then
echo " ⚠ Significant memory growth detected (${GROWTH} MB)" >> "$REPORT_FILE"
echo " - Possible memory leak" >> "$REPORT_FILE"
echo " - Use heap profiling to identify leak sources" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
fi
fi
cat >> "$REPORT_FILE" << EOF
Output Files:
─────────────────────────────────────────────────────────
EOF
ls -lh "$OUTPUT_DIR"/*-$TIMESTAMP.* >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
echo "═══════════════════════════════════════════════════════════" >> "$REPORT_FILE"
log_success "Profile complete!"
log_info "Report saved to: $REPORT_FILE"
# Display summary
cat "$REPORT_FILE"
exit 0

commands/debug/README.md

@@ -0,0 +1,596 @@
# Debug Skill - Comprehensive Debugging Toolkit
A professional-grade debugging skill for diagnosing, reproducing, fixing, analyzing, and optimizing complex issues across the entire application stack.
## Overview
The debug skill provides systematic debugging operations that work seamlessly with the **10x-fullstack-engineer** agent to deliver cross-stack debugging expertise, production-grade strategies, and prevention-focused solutions.
## Available Operations
### 1. **diagnose** - Comprehensive Diagnosis and Root Cause Analysis
Performs systematic diagnosis across all layers of the application stack to identify root causes of complex issues.
**Usage:**
```bash
/10x-fullstack-engineer:debug diagnose issue:"Users getting 500 errors on file upload" environment:"production" logs:"logs/app.log"
```
**Parameters:**
- `issue:"description"` (required) - Problem description
- `environment:"prod|staging|dev"` (optional) - Target environment
- `logs:"path"` (optional) - Log file location
- `reproduction:"steps"` (optional) - Steps to reproduce
- `impact:"severity"` (optional) - Issue severity
**What it does:**
- Collects diagnostic data from frontend, backend, database, and infrastructure
- Analyzes symptoms and patterns across all stack layers
- Forms and tests hypotheses systematically
- Identifies root cause with supporting evidence
- Provides actionable recommendations
**Output:**
- Executive summary of issue and root cause
- Detailed diagnostic data from each layer
- Hypothesis analysis with evidence
- Root cause explanation
- Recommended immediate actions and permanent fix
- Prevention measures (monitoring, testing, documentation)
---
### 2. **reproduce** - Create Reliable Reproduction Strategies
Develops reliable strategies to reproduce issues consistently, creating test cases and reproduction documentation.
**Usage:**
```bash
/10x-fullstack-engineer:debug reproduce issue:"Payment webhook fails intermittently" environment:"staging" data:"sample-webhook-payload.json"
```
**Parameters:**
- `issue:"description"` (required) - Issue to reproduce
- `environment:"prod|staging|dev"` (optional) - Environment context
- `data:"path"` (optional) - Test data location
- `steps:"description"` (optional) - Known reproduction steps
- `reliability:"percentage"` (optional) - Current reproduction rate
**What it does:**
- Gathers environment, data, and user context
- Creates local reproduction strategy
- Develops automated test cases (unit, integration, E2E)
- Tests scenario variations and edge cases
- Verifies reproduction reliability
- Documents comprehensive reproduction guide
**Output:**
- Reproduction reliability metrics
- Prerequisites and setup instructions
- Detailed reproduction steps (manual and automated)
- Automated test case code
- Scenario variations tested
- Troubleshooting guide for reproduction issues
---
### 3. **fix** - Implement Targeted Fixes with Verification
Implements targeted fixes with comprehensive verification, safeguards, and prevention measures.
**Usage:**
```bash
/10x-fullstack-engineer:debug fix issue:"Race condition in order processing" root_cause:"Missing transaction lock" verification:"run-integration-tests"
```
**Parameters:**
- `issue:"description"` (required) - Issue being fixed
- `root_cause:"cause"` (required) - Identified root cause
- `verification:"strategy"` (optional) - Verification approach
- `scope:"areas"` (optional) - Affected code areas
- `rollback:"plan"` (optional) - Rollback strategy
**What it does:**
- Designs appropriate fix pattern for the issue type
- Implements fix with safety measures
- Adds safeguards (validation, rate limiting, circuit breakers)
- Performs multi-level verification (unit, integration, load, production)
- Adds prevention measures (tests, monitoring, alerts)
- Documents fix and deployment plan
**Fix patterns supported:**
- Missing error handling
- Race conditions
- Memory leaks
- Missing validation
- N+1 query problems
- Configuration issues
- Infrastructure limits
**Output:**
- Detailed fix implementation with before/after code
- Safeguards added (validation, error handling, monitoring)
- Verification results at all levels
- Prevention measures (tests, alerts, documentation)
- Deployment plan with rollback strategy
- Files modified and commits made
---
### 4. **analyze-logs** - Deep Log Analysis with Pattern Detection
Performs deep log analysis with pattern detection, timeline correlation, and anomaly identification.
**Usage:**
```bash
/10x-fullstack-engineer:debug analyze-logs path:"logs/application.log" pattern:"ERROR.*timeout" timeframe:"last-24h"
```
**Parameters:**
- `path:"log-file-path"` (required) - Log file to analyze
- `pattern:"regex"` (optional) - Filter pattern
- `timeframe:"range"` (optional) - Time range to analyze
- `level:"error|warn|info"` (optional) - Log level filter
- `context:"lines"` (optional) - Context lines around matches
**What it does:**
- Discovers and filters relevant logs across all sources
- Detects error patterns and clusters similar errors
- Performs timeline analysis and event correlation
- Traces individual requests across services
- Identifies statistical anomalies and spikes
- Analyzes performance, user impact, and security issues
**Utility script:**
```bash
./commands/debug/.scripts/analyze-logs.sh \
--file logs/application.log \
--level ERROR \
--since "1 hour ago" \
--context 5
```
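For machine-readable output, the same utility also accepts `--format json`; a minimal sketch (jq is an optional dependency) that extracts the match count:
```bash
./commands/debug/.scripts/analyze-logs.sh \
  --file logs/application.log \
  --level ERROR \
  --format json | jq '.matches'
```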
**Output:**
- Summary of findings with key statistics
- Top errors with frequency and patterns
- Timeline of critical events
- Request tracing through distributed system
- Anomaly detection (spikes, new errors)
- Performance analysis from logs
- User impact assessment
- Root cause analysis based on log patterns
- Recommendations for fixes and monitoring
---
### 5. **performance** - Performance Debugging and Optimization
Debugs performance issues through profiling, bottleneck identification, and targeted optimization.
**Usage:**
```bash
/10x-fullstack-engineer:debug performance component:"api-endpoint:/orders" metric:"response-time" threshold:"200ms"
```
**Parameters:**
- `component:"name"` (required) - Component to profile
- `metric:"type"` (optional) - Metric to measure (response-time, throughput, cpu, memory)
- `threshold:"value"` (optional) - Target performance threshold
- `duration:"period"` (optional) - Profiling duration
- `load:"users"` (optional) - Concurrent users for load testing
**What it does:**
- Establishes performance baseline
- Profiles application, database, and network
- Identifies bottlenecks (CPU, I/O, memory, network)
- Implements targeted optimizations (queries, caching, algorithms, async)
- Performs load testing to verify improvements
- Sets up performance monitoring
**Profiling utility script:**
```bash
./commands/debug/.scripts/profile.sh \
--app node_app \
--duration 60 \
--endpoint http://localhost:3000/api/slow
```
**Optimization strategies:**
- Query optimization (indexes, query rewriting)
- Caching (application-level, Redis)
- Code optimization (algorithms, lazy loading, pagination)
- Async optimization (parallel execution, batching)
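To quantify these strategies, the same load can be replayed before and after a change; a minimal sketch using Apache Bench (assumptions: `ab` is installed and `http://localhost:3000/api/orders` is the endpoint under test):
```bash
# Record the mean latency for a short fixed load so before/after runs can be compared
ab -n 1000 -c 10 http://localhost:3000/api/orders | grep "Time per request" | head -1
```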
**Output:**
- Performance baseline and after-optimization metrics
- Bottlenecks identified with evidence
- Optimizations implemented with code changes
- Load testing results
- Performance improvement percentages
- Monitoring setup (metrics, dashboards, alerts)
- Recommendations for additional optimizations
---
### 6. **memory** - Memory Leak Detection and Optimization
Detects memory leaks, analyzes memory usage patterns, and optimizes memory consumption.
**Usage:**
```bash
/10x-fullstack-engineer:debug memory component:"background-worker" symptom:"growing-heap" duration:"6h"
```
**Parameters:**
- `component:"name"` (required) - Component to analyze
- `symptom:"type"` (optional) - Memory symptom (growing-heap, high-usage, oom)
- `duration:"period"` (optional) - Observation period
- `threshold:"max-mb"` (optional) - Memory threshold in MB
- `profile:"type"` (optional) - Profile type (heap, allocation)
**What it does:**
- Identifies memory symptoms (leaks, high usage, OOM)
- Captures memory profiles (heap snapshots, allocation tracking)
- Analyzes common leak patterns
- Implements memory optimizations
- Performs leak verification under load
- Tunes garbage collection
**Memory check utility script:**
```bash
./commands/debug/.scripts/memory-check.sh \
--app node_app \
--duration 300 \
--interval 10 \
--threshold 1024
```
**Common leak patterns detected:**
- Event listeners not removed
- Timers not cleared
- Closures holding references
- Unbounded caches
- Global variable accumulation
- Detached DOM nodes
- Infinite promise chains
**Optimization techniques:**
- Stream large data instead of loading into memory
- Use efficient data structures (Map vs Array)
- Paginate database queries
- Implement LRU caches with size limits
- Use weak references where appropriate
- Object pooling for frequently created objects
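After applying any of these techniques, the memory-check utility can confirm whether growth has actually stopped; a minimal sketch (assuming the target process is named `worker`):
```bash
./commands/debug/.scripts/memory-check.sh --app worker --duration 600 --interval 15
# The generated report states the verdict explicitly
grep -E "MEMORY LEAK DETECTED|NO MEMORY LEAK DETECTED" ./memory-check-output/memory-report-*.txt
```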
**Output:**
- Memory symptoms and baseline metrics
- Heap snapshot analysis
- Memory leaks identified with evidence
- Fixes implemented with before/after code
- Memory after fixes with improvement percentages
- Memory stability test results
- Garbage collection metrics
- Monitoring setup and alerts
- Recommendations for memory limits and future monitoring
---
## Utility Scripts
The debug skill includes three utility scripts in the `.scripts/` directory:
### analyze-logs.sh
**Purpose:** Analyze log files for patterns, errors, and anomalies
**Features:**
- Pattern matching with regex
- Log level filtering
- Time-based filtering
- Context lines around matches
- Error statistics and top errors
- Time distribution analysis
- JSON output support
### profile.sh
**Purpose:** Profile application performance (CPU, memory, I/O)
**Features:**
- CPU profiling with statistics
- Memory profiling with growth detection
- I/O profiling
- Concurrent load testing
- Automated recommendations
- Comprehensive reports
### memory-check.sh
**Purpose:** Monitor memory usage and detect leaks
**Features:**
- Real-time memory monitoring
- Memory growth detection
- Leak detection with trend analysis
- ASCII memory usage charts
- Threshold alerts
- Detailed memory reports
---
## Common Debugging Workflows
### Workflow 1: Production Error Investigation
```bash
# Step 1: Diagnose the issue
/10x-fullstack-engineer:debug diagnose issue:"500 errors on checkout" environment:"production" logs:"logs/app.log"
# Step 2: Analyze logs for patterns
/10x-fullstack-engineer:debug analyze-logs path:"logs/app.log" pattern:"checkout.*ERROR" timeframe:"last-1h"
# Step 3: Reproduce locally
/10x-fullstack-engineer:debug reproduce issue:"Checkout fails with 500" environment:"staging" data:"test-checkout.json"
# Step 4: Implement fix
/10x-fullstack-engineer:debug fix issue:"Database timeout on checkout" root_cause:"Missing connection pool configuration"
```
### Workflow 2: Performance Degradation
```bash
# Step 1: Profile performance
/10x-fullstack-engineer:debug performance component:"api-endpoint:/checkout" metric:"response-time" threshold:"500ms"
# Step 2: Analyze slow queries
/10x-fullstack-engineer:debug analyze-logs path:"logs/postgresql.log" pattern:"duration:.*[0-9]{4,}"
# Step 3: Implement optimization
/10x-fullstack-engineer:debug fix issue:"Slow checkout API" root_cause:"N+1 query on order items"
```
### Workflow 3: Memory Leak Investigation
```bash
# Step 1: Diagnose memory symptoms
/10x-fullstack-engineer:debug diagnose issue:"Memory grows over time" environment:"production"
# Step 2: Profile memory usage
/10x-fullstack-engineer:debug memory component:"background-processor" symptom:"growing-heap" duration:"1h"
# Step 3: Implement fix
/10x-fullstack-engineer:debug fix issue:"Memory leak in event handlers" root_cause:"Event listeners not removed"
```
### Workflow 4: Intermittent Failure
```bash
# Step 1: Reproduce reliably
/10x-fullstack-engineer:debug reproduce issue:"Random payment failures" environment:"staging"
# Step 2: Diagnose with reproduction
/10x-fullstack-engineer:debug diagnose issue:"Payment webhook fails intermittently" reproduction:"steps-from-reproduce"
# Step 3: Analyze timing
/10x-fullstack-engineer:debug analyze-logs path:"logs/webhooks.log" pattern:"payment.*fail" context:10
# Step 4: Fix race condition
/10x-fullstack-engineer:debug fix issue:"Race condition in webhook handler" root_cause:"Concurrent webhook processing"
```
---
## Integration with 10x-fullstack-engineer Agent
All debugging operations are designed to work with the **10x-fullstack-engineer** agent, which provides:
- **Cross-stack debugging expertise** - Systematic analysis across frontend, backend, database, and infrastructure
- **Systematic root cause analysis** - Hypothesis formation, testing, and evidence-based conclusions
- **Production-grade debugging strategies** - Safe, reliable approaches suitable for production environments
- **Performance and security awareness** - Considers performance impact and security implications
- **Prevention-focused mindset** - Not just fixing issues, but preventing future occurrences
The agent brings deep expertise in:
- Full-stack architecture patterns
- Performance optimization techniques
- Memory management and leak detection
- Database query optimization
- Distributed systems debugging
- Production safety and deployment strategies
---
## Debugging Best Practices
### 1. Start with Diagnosis
Always begin with `/debug diagnose` to understand the full scope of the issue before attempting fixes.
### 2. Reproduce Reliably
Use `/debug reproduce` to create reproducible test cases. A bug that can't be reliably reproduced is hard to fix and verify.
### 3. Analyze Logs Systematically
Use `/debug analyze-logs` to find patterns and correlations. Look for:
- Error frequency and distribution
- Timeline correlation with deployments
- Anomalies and spikes
- Request tracing across services
### 4. Profile Before Optimizing
Use `/debug performance` and `/debug memory` to identify actual bottlenecks. Don't optimize based on assumptions.
### 5. Fix with Verification
Use `/debug fix` which includes:
- Proper error handling
- Comprehensive testing
- Monitoring and alerts
- Documentation
### 6. Add Prevention Measures
Every fix should include:
- Regression tests
- Monitoring metrics
- Alerts on thresholds
- Documentation updates
---
## Output Documentation
Each operation generates comprehensive reports in markdown format:
- **Executive summaries** for stakeholders
- **Detailed technical analysis** for engineers
- **Code snippets** with before/after comparisons
- **Evidence and metrics** supporting conclusions
- **Actionable recommendations** with priorities
- **Next steps** with clear instructions
Reports include:
- Issue description and symptoms
- Analysis methodology and findings
- Root cause explanation with evidence
- Fixes implemented with code
- Verification results
- Prevention measures added
- Files modified and commits
- Monitoring and alerting setup
---
## Error Handling
All operations include robust error handling:
- **Insufficient information** - Lists what's needed and how to gather it
- **Cannot reproduce** - Suggests alternative debugging approaches
- **Fix verification fails** - Provides re-diagnosis steps
- **Optimization degrades performance** - Includes rollback procedures
- **Environment differences** - Helps bridge local vs production gaps
---
## Common Debugging Scenarios
### Database Performance Issues
1. Use `/debug performance` to establish baseline
2. Use `/debug analyze-logs` on database slow query logs
3. Identify missing indexes or inefficient queries
4. Use `/debug fix` to implement optimization
5. Verify with load testing
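Step 2 assumes a slow-query log exists; if it does not, it can usually be enabled first. A PostgreSQL sketch (assumes superuser access; the threshold is in milliseconds):
```bash
# Log every statement slower than 1 second, then reload the configuration
psql -c "ALTER SYSTEM SET log_min_duration_statement = '1000';"
psql -c "SELECT pg_reload_conf();"
```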
### Memory Leaks
1. Use `/debug diagnose` to identify symptoms
2. Use `/debug memory` to capture heap profiles
3. Identify leak patterns (event listeners, timers, caches)
4. Use `/debug fix` to implement cleanup
5. Verify with sustained load testing
### Intermittent Errors
1. Use `/debug analyze-logs` to find error patterns
2. Use `/debug reproduce` to create reliable reproduction
3. Use `/debug diagnose` with reproduction steps
4. Identify timing or concurrency issues
5. Use `/debug fix` to implement proper synchronization
### Production Incidents
1. Use `/debug diagnose` for rapid root cause analysis
2. Use `/debug analyze-logs` for recent time period
3. Implement immediate mitigation (rollback, circuit breaker)
4. Use `/debug reproduce` to prevent recurrence
5. Use `/debug fix` for permanent solution
### Performance Degradation
1. Use `/debug performance` to compare against baseline
2. Identify bottlenecks (CPU, I/O, memory, network)
3. Use `/debug analyze-logs` for slow operations
4. Implement targeted optimizations
5. Verify improvements with load testing
---
## Tips and Tricks
### Effective Log Analysis
- Use pattern matching to find related errors
- Look for request IDs to trace across services
- Check timestamps for correlation with deployments
- Compare error rates before and after changes
- Use context lines to understand error conditions
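As a quick sketch of the before/after comparison, count errors in the previously rotated log versus the current one (the paths are assumptions):
```bash
echo "before deploy: $(grep -c 'ERROR' logs/app.log.1) errors"
echo "after deploy:  $(grep -c 'ERROR' logs/app.log) errors"
```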
### Performance Profiling
- Profile production-like workloads
- Use realistic data sizes
- Test under sustained load, not just peak
- Profile both CPU and memory together
- Use flame graphs for visual analysis
### Memory Debugging
- Force GC between measurements for accuracy
- Take multiple heap snapshots over time
- Look for objects that never get collected
- Check for consistent growth, not just spikes
- Verify fixes with extended monitoring
### Reproduction Strategies
- Minimize reproduction to essential steps
- Control timing with explicit delays
- Use specific test data that triggers issue
- Document environment differences
- Aim for >80% reproduction reliability
---
## File Locations
```
plugins/10x-fullstack-engineer/commands/debug/
├── skill.md # Router/orchestrator
├── diagnose.md # Diagnosis operation
├── reproduce.md # Reproduction operation
├── fix.md # Fix implementation operation
├── analyze-logs.md # Log analysis operation
├── performance.md # Performance debugging operation
├── memory.md # Memory debugging operation
├── .scripts/
│ ├── analyze-logs.sh # Log analysis utility
│ ├── profile.sh # Performance profiling utility
│ └── memory-check.sh # Memory monitoring utility
└── README.md # This file
```
---
## Requirements
- **Node.js operations**: Node.js runtime with `--inspect` or `--prof` flags for profiling
- **Log analysis**: Standard Unix tools (awk, grep, sed), optional jq for JSON logs
- **Performance profiling**: Apache Bench (ab), k6, or Artillery for load testing
- **Memory profiling**: Chrome DevTools, clinic.js, or memwatch for Node.js
- **Database profiling**: Access to database query logs and EXPLAIN ANALYZE capability
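A quick way to verify the command-line prerequisites before running the utility scripts (a sketch; trim the tool list to the operations you plan to use):
```bash
for tool in awk grep sed jq bc ab; do
  command -v "$tool" >/dev/null 2>&1 || echo "missing: $tool"
done
```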
---
## Support and Troubleshooting
If operations fail:
1. Check that required parameters are provided
2. Verify file paths and permissions
3. Ensure utility scripts are executable (`chmod +x .scripts/*.sh`)
4. Check that prerequisite tools are installed
5. Review error messages for specific issues
For complex debugging scenarios:
- Start with `/debug diagnose` for systematic analysis
- Use multiple operations in sequence for comprehensive investigation
- Leverage the 10x-fullstack-engineer agent's expertise
- Document findings and share with team
---
## Version
Debug Skill v1.0.0
---
## License
Part of the 10x-fullstack-engineer plugin for Claude Code.

commands/debug/analyze-logs.md

@@ -0,0 +1,842 @@
# Analyze Logs Operation - Deep Log Analysis
You are executing the **analyze-logs** operation to perform deep log analysis with pattern detection, timeline correlation, and anomaly identification.
## Parameters
**Received**: `$ARGUMENTS` (after removing 'analyze-logs' operation name)
Expected format: `path:"log-file-path" [pattern:"regex-pattern"] [timeframe:"time-range"] [level:"error|warn|info"] [context:"lines-before-after"]`
## Workflow
### 1. Discover and Locate Logs
Identify all relevant log sources:
**Application Logs**:
```bash
# Common log locations
ls -lh /var/log/application/
ls -lh logs/
ls -lh ~/.pm2/logs/
# Find log files
find /var/log -name "*.log" -type f
find . -name "*.log" -mtime -1 # Modified in last 24 hours
# Check log rotation
ls -lh /var/log/application/*.log*
zcat /var/log/application/app.log.*.gz # Read rotated logs
```
**System Logs**:
```bash
# Systemd service logs
journalctl -u application.service --since "1 hour ago"
journalctl -u application.service --since "2024-10-14 14:00:00"
# Syslog
tail -f /var/log/syslog
tail -f /var/log/messages
# Kernel logs
dmesg -T
```
**Container Logs**:
```bash
# Docker
docker logs container-name --since 1h
docker logs container-name --timestamps
docker logs --tail 1000 container-name > container-logs.txt
# Kubernetes
kubectl logs pod-name -c container-name
kubectl logs pod-name --previous # Previous container
kubectl logs -l app=myapp --all-containers=true
```
**Web Server Logs**:
```bash
# Nginx
tail -f /var/log/nginx/access.log
tail -f /var/log/nginx/error.log
# Apache
tail -f /var/log/apache2/access.log
tail -f /var/log/apache2/error.log
```
**Database Logs**:
```bash
# PostgreSQL
tail -f /var/log/postgresql/postgresql-*.log
# MySQL
tail -f /var/log/mysql/error.log
tail -f /var/log/mysql/slow-query.log
# MongoDB
tail -f /var/log/mongodb/mongod.log
```
### 2. Filter and Extract Relevant Logs
Use the `.scripts/analyze-logs.sh` utility to extract relevant log entries:
**Basic Extraction**:
```bash
# Extract errors from last hour
./commands/debug/.scripts/analyze-logs.sh \
--file logs/application.log \
--level ERROR \
--since "1 hour ago"
# Extract with pattern matching
./commands/debug/.scripts/analyze-logs.sh \
--file logs/application.log \
--pattern "timeout|connection.*refused" \
--context 5
# Extract specific timeframe
./commands/debug/.scripts/analyze-logs.sh \
--file logs/application.log \
--start "2024-10-14 14:00:00" \
--end "2024-10-14 15:00:00"
```
**Manual Filtering**:
```bash
# Find errors with context
grep -i "error" logs/application.log -A 5 -B 5
# Find specific error patterns
grep -E "(timeout|refused|failed)" logs/application.log
# Find errors in timeframe
awk '/2024-10-14 14:/ && /ERROR/ {print}' logs/application.log
# Count errors by type
grep "ERROR" logs/application.log | awk '{print $5}' | sort | uniq -c | sort -rn
# Extract JSON logs with jq
cat logs/application.log | jq 'select(.level == "error")'
cat logs/application.log | jq 'select(.message | contains("timeout"))'
```
### 3. Pattern Detection
Identify patterns in log data:
#### Error Patterns
**Frequency Analysis**:
```bash
# Error frequency over time
grep "ERROR" logs/application.log | \
awk '{print $1, $2}' | \
cut -d: -f1 | \
uniq -c
# Most common errors
grep "ERROR" logs/application.log | \
awk -F'ERROR' '{print $2}' | \
sort | uniq -c | sort -rn | head -20
# Error rate calculation
total_lines=$(wc -l < logs/application.log)
error_lines=$(grep -c "ERROR" logs/application.log)
echo "Error rate: $(echo "scale=4; $error_lines / $total_lines * 100" | bc)%"
```
**Error Clustering**:
```python
# Group similar errors
import re
from collections import Counter
def normalize_error(error_msg):
# Remove numbers, IDs, timestamps
error_msg = re.sub(r'\d+', 'N', error_msg)
error_msg = re.sub(r'[a-f0-9-]{36}', 'UUID', error_msg)
error_msg = re.sub(r'\d{4}-\d{2}-\d{2}', 'DATE', error_msg)
return error_msg
errors = []
with open('logs/application.log') as f:
for line in f:
if 'ERROR' in line:
normalized = normalize_error(line)
errors.append(normalized)
# Count error types
error_counts = Counter(errors)
for error, count in error_counts.most_common(10):
print(f"{count}: {error}")
```
#### Request Patterns
**Request Analysis**:
```bash
# Requests per minute (the timestamp is field $4 in the default combined log format)
awk '{print $4}' /var/log/nginx/access.log | \
cut -d: -f1-3 | \
uniq -c
# Most requested endpoints
awk '{print $7}' /var/log/nginx/access.log | \
sort | uniq -c | sort -rn | head -20
# Response code distribution
awk '{print $9}' /var/log/nginx/access.log | \
sort | uniq -c | sort -rn
# Slow requests (>1 second) — assumes the log_format appends $request_time as field $10
awk '$10 > 1.0 {print $0}' /var/log/nginx/access.log
# Top user agents
awk -F'"' '{print $6}' /var/log/nginx/access.log | \
sort | uniq -c | sort -rn | head -10
```
#### Performance Patterns
**Response Time Analysis**:
```bash
# Average response time
awk '{sum+=$10; count++} END {print "Average:", sum/count}' \
/var/log/nginx/access.log
# Response time percentiles
awk '{print $10}' /var/log/nginx/access.log | \
sort -n | \
awk '{
times[NR] = $1
}
END {
print "P50:", times[int(NR*0.5)]
print "P95:", times[int(NR*0.95)]
print "P99:", times[int(NR*0.99)]
}'
# Average response time per hour (assumes $request_time is field $10)
awk '{split($4, t, ":"); hour=t[1]":"t[2]; sum[hour]+=$10; count[hour]++}
END {for (hour in sum) print hour, sum[hour]/count[hour]}' \
/var/log/nginx/access.log | sort
```
### 4. Timeline Analysis
Create timeline of events:
**Timeline Construction**:
```bash
# Merge multiple log sources by timestamp
sort -m -k1,2 \
logs/application.log \
logs/database.log \
logs/nginx.log \
> merged-timeline.log
# Extract timeline around specific event
event_time="2024-10-14 14:30:15"
grep "$event_time" logs/application.log -B 100 -A 100
# Timeline with multiple sources
for log in logs/*.log; do
echo "=== $(basename $log) ==="
grep "$event_time" "$log" -B 10 -A 10
echo ""
done
```
**Event Correlation**:
```python
# Correlate events across log sources
import re
from datetime import datetime, timedelta
def parse_log_line(line):
# Extract timestamp and message
match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', line)
if match:
timestamp = datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S')
return timestamp, line
return None, None
# Load events from multiple logs
events = []
for log_file in ['app.log', 'db.log', 'nginx.log']:
with open(f'logs/{log_file}') as f:
for line in f:
timestamp, message = parse_log_line(line)
if timestamp:
events.append((timestamp, log_file, message))
# Sort by timestamp
events.sort(key=lambda x: x[0])
# Find events within time window
def find_related_events(target_time, window_seconds=10):
window = timedelta(seconds=window_seconds)
start_time = target_time - window
end_time = target_time + window
related = [
event for event in events
if start_time <= event[0] <= end_time
]
return related
# Analyze error event
error_time = datetime(2024, 10, 14, 14, 30, 15)
related = find_related_events(error_time)
for timestamp, source, message in related:
print(f"[{source}] {timestamp}: {message.strip()}")
```
### 5. Request Tracing
Trace individual requests across services:
**Request ID Tracing**:
```bash
# Extract request ID from error
error_line=$(grep "ERROR" logs/application.log | head -1)
request_id=$(echo "$error_line" | grep -oP 'request_id=\K[a-f0-9-]+')
echo "Tracing request: $request_id"
# Find all log entries for this request
grep "$request_id" logs/application.log
# Across multiple services
for log in logs/*.log; do
echo "=== $(basename $log) ==="
grep "$request_id" "$log"
done
# With timestamps for timeline
grep "$request_id" logs/*.log | sort -k1,2
```
**Distributed Tracing Correlation**:
```bash
# Extract trace ID from logs
trace_id=$(grep "ERROR" logs/application.log | \
head -1 | \
grep -oP 'trace_id=\K[a-f0-9]+')
# Query distributed tracing system
# Jaeger
curl "http://jaeger:16686/api/traces/$trace_id"
# Zipkin
curl "http://zipkin:9411/api/v2/trace/$trace_id"
```
### 6. Anomaly Detection
Identify unusual patterns:
**Statistical Anomalies**:
```python
import statistics
from collections import defaultdict
# Analyze error rates per hour
hourly_errors = defaultdict(int)
with open('logs/application.log') as f:
for line in f:
if 'ERROR' in line:
# Extract hour
hour = line[:13] # YYYY-MM-DD HH
hourly_errors[hour] += 1
# Calculate statistics
error_counts = list(hourly_errors.values())
mean = statistics.mean(error_counts)
stdev = statistics.stdev(error_counts)
# Find anomalies (>2 standard deviations)
print("Anomalous hours (>2 std dev from mean):")
for hour, count in sorted(hourly_errors.items()):
z_score = (count - mean) / stdev
if abs(z_score) > 2:
print(f"{hour}: {count} errors (z-score: {z_score:.2f})")
```
**New Error Types**:
```bash
# Compare today's errors with baseline
grep "ERROR" logs/application.log.1 | \
awk -F'ERROR' '{print $2}' | \
sort -u > baseline_errors.txt
grep "ERROR" logs/application.log | \
awk -F'ERROR' '{print $2}' | \
sort -u > current_errors.txt
# Find new error types
comm -13 baseline_errors.txt current_errors.txt > new_errors.txt
echo "New error types detected:"
cat new_errors.txt
```
**Spike Detection**:
```python
# Detect sudden spikes in error rate
from collections import deque
def detect_spikes(values, window_size=10, threshold=3):
"""Detect values that are >threshold times the rolling average"""
window = deque(maxlen=window_size)
spikes = []
for i, value in enumerate(values):
if len(window) == window_size:
avg = sum(window) / len(window)
if value > avg * threshold:
spikes.append((i, value, avg))
window.append(value)
return spikes
# Analyze minute-by-minute error counts
minute_errors = {} # {minute: error_count}
with open('logs/application.log') as f:
for line in f:
if 'ERROR' in line:
minute = line[:16] # YYYY-MM-DD HH:MM
minute_errors[minute] = minute_errors.get(minute, 0) + 1
# Detect spikes
error_counts = [minute_errors.get(m, 0) for m in sorted(minute_errors.keys())]
spikes = detect_spikes(error_counts, window_size=10, threshold=3)
print("Error spikes detected:")
for idx, value, avg in spikes:
print(f"Minute {idx}: {value} errors (avg was {avg:.1f})")
```
### 7. Performance Analysis
Analyze performance from logs:
**Slow Query Analysis**:
```bash
# PostgreSQL slow query log (the duration field position depends on log_line_prefix; $13 is assumed here)
cat /var/log/postgresql/postgresql.log | \
grep "duration:" | \
awk '{print $13, $0}' | \
sort -rn | \
head -20
# Extract slow queries
awk '/duration:/ && $13 > 1000 {print $0}' \
/var/log/postgresql/postgresql.log
```
**Endpoint Performance**:
```bash
# Average response time per endpoint
awk '{endpoint[$7] += $10; count[$7]++}
END {
for (e in endpoint) {
printf "%s: %.2fms\n", e, endpoint[e]/count[e]
}
}' /var/log/nginx/access.log | sort -t: -k2 -rn
# Slowest endpoints
awk '{print $10, $7}' /var/log/nginx/access.log | \
sort -rn | \
head -20
```
### 8. User Impact Analysis
Assess user-facing impact:
**Affected Users**:
```bash
# Extract unique users experiencing errors
grep "ERROR" logs/application.log | \
grep -oP 'user_id=\K[a-zA-Z0-9]+' | \
sort -u | \
wc -l
# Error rate by user
grep "ERROR" logs/application.log | \
grep -oP 'user_id=\K[a-zA-Z0-9]+' | \
sort | uniq -c | sort -rn | head -20
# Error rate per user (group by user_id instead of by full log line)
awk 'match($0, /user_id=[a-zA-Z0-9]+/) {
user = substr($0, RSTART+8, RLENGTH-8)
total[user]++
if (/ERROR/) errors[user]++
}
END {
for (user in total) {
printf "%s %.1f%%\n", user, errors[user]/total[user]*100
}
}' logs/application.log | sort -k2 -rn | head -20
```
**Failed Requests**:
```bash
# 5xx errors
grep " 5[0-9][0-9] " /var/log/nginx/access.log
# Failed endpoints
awk '$9 >= 500 {print $7}' /var/log/nginx/access.log | \
sort | uniq -c | sort -rn
# Failed request details
awk '$9 >= 500 {print $4, $7, $9, $10}' \
/var/log/nginx/access.log
```
### 9. Resource Usage from Logs
Extract resource usage patterns:
**Memory Usage**:
```bash
# Extract memory logs
grep -i "memory\|heap\|oom" logs/application.log
# Parse memory usage
grep "heap_used" logs/application.log | \
awk '{print $1, $2, $NF}' | \
sed 's/MB$//'
```
**Connection Pool**:
```bash
# Database connection logs
grep "connection" logs/application.log | \
grep -oP 'pool_size=\K\d+|active=\K\d+|idle=\K\d+'
# Connection exhaustion
grep "connection.*timeout\|pool.*exhausted" logs/application.log -A 5
```
### 10. Security Analysis
Look for security-related issues:
**Authentication Failures**:
```bash
# Failed login attempts
grep -i "authentication.*failed\|login.*failed" logs/application.log
# By IP address
grep "authentication.*failed" logs/application.log | \
grep -oP 'ip=\K[\d.]+' | \
sort | uniq -c | sort -rn
# Brute force detection
grep "authentication.*failed" logs/application.log | \
grep -oP 'ip=\K[\d.]+' | \
uniq -c | \
awk '$1 > 10 {print $2, $1 " attempts"}'
```
**Suspicious Patterns**:
```bash
# SQL injection attempts
grep -iE "union.*select|drop.*table|; --" /var/log/nginx/access.log
# Path traversal attempts
grep -E "\.\./|\.\.%2F" /var/log/nginx/access.log
# XSS attempts
grep -iE "<script|javascript:|onerror=" /var/log/nginx/access.log
# Command injection attempts
grep -E ";\s*(cat|ls|wget|curl)" /var/log/nginx/access.log
```
## Output Format
```markdown
# Log Analysis Report: [Issue/Time Period]
## Summary
[High-level summary of findings]
## Analysis Period
- **Start**: [start timestamp]
- **End**: [end timestamp]
- **Duration**: [duration]
- **Log Sources**: [list of logs analyzed]
- **Total Lines**: [number of log lines]
## Key Findings
### Error Analysis
- **Total Errors**: [count]
- **Error Rate**: [percentage]%
- **Error Types**: [number of unique error types]
- **Most Common Error**: [error type] ([count] occurrences)
### Top Errors
1. **[Error Type 1]** - [count] occurrences
\`\`\`
[sample log line]
\`\`\`
- First seen: [timestamp]
- Last seen: [timestamp]
- Peak: [timestamp with highest frequency]
2. **[Error Type 2]** - [count] occurrences
\`\`\`
[sample log line]
\`\`\`
- [similar details]
### Patterns Detected
#### Pattern 1: [Pattern Name]
- **Description**: [what pattern is]
- **Frequency**: [how often it occurs]
- **Impact**: [user/system impact]
- **Example**:
\`\`\`
[log excerpt showing pattern]
\`\`\`
#### Pattern 2: [Pattern Name]
[similar structure]
## Timeline Analysis
### Critical Events Timeline
\`\`\`
14:25:30 [APP] Normal operation, avg response time 50ms
14:28:45 [APP] Response time increasing to 150ms
14:29:10 [DB] Connection pool usage at 90%
14:29:30 [APP] First timeout errors appear
14:29:45 [DB] Connection pool exhausted
14:30:00 [APP] Error rate spikes to 25%
14:30:15 [APP] Circuit breaker opens
14:30:30 [OPS] Auto-scaling triggers
14:32:00 [APP] New instances online
14:33:00 [APP] Error rate decreases to 5%
14:35:00 [APP] Full recovery, normal operation
\`\`\`
### Event Correlation
**Root Event**: Database connection pool exhaustion at 14:29:45
**Contributing Factors**:
- High traffic spike (+300% at 14:28:00)
- Long-running queries (>5s queries detected)
- Insufficient connection pool size (max: 20)
**Cascading Effects**:
- API timeouts (starting 14:29:30)
- Cache misses due to timeouts
- Increased load from retries
- Circuit breaker activation
## Request Tracing
### Example Failed Request
**Request ID**: req_abc123def456
**Timeline**:
\`\`\`
14:30:15.123 [NGINX] Request received: POST /api/orders
14:30:15.125 [APP] Request processing started
14:30:15.130 [APP] Database query started: SELECT orders...
14:30:20.131 [DB] Query timeout after 5s
14:30:20.135 [APP] Error: Database timeout
14:30:20.137 [APP] Response: 500 Internal Server Error
14:30:20.140 [NGINX] Response sent (5017ms)
\`\`\`
**User Impact**: Order creation failed for user_123
## Anomalies Detected
### Anomaly 1: Error Rate Spike
- **Time**: 14:30:00 - 14:35:00
- **Severity**: High
- **Details**: Error rate jumped from 0.1% to 25%
- **Affected Users**: ~500 users
- **Root Cause**: Database connection pool exhaustion
### Anomaly 2: New Error Type
- **Error**: "ConnectionPoolExhausted"
- **First Seen**: 14:29:45
- **Frequency**: 1,234 occurrences in 5 minutes
- **Status**: Previously unseen in baseline
## Performance Analysis
### Response Time Statistics
- **Average**: 150ms (baseline: 50ms)
- **P50**: 80ms
- **P95**: 500ms
- **P99**: 2000ms
- **Max**: 5000ms
### Slowest Endpoints
1. `/api/orders` - avg 450ms (1,200 requests)
2. `/api/users/profile` - avg 380ms (800 requests)
3. `/api/reports` - avg 320ms (200 requests)
### Database Performance
- **Slow Queries**: 45 queries >1s
- **Slowest Query**: 5.2s (SELECT with missing index)
- **Average Query Time**: 85ms (baseline: 25ms)
## User Impact
### Affected Users
- **Total Affected**: ~500 users
- **Error Rate by User Type**:
- Premium users: 5% error rate
- Free users: 30% error rate
- **Most Affected User**: user_789 (25 errors)
### Failed Operations
- **Order Creation**: 234 failures
- **Payment Processing**: 89 failures
- **Profile Updates**: 45 failures
## Resource Analysis
### Connection Pool
- **Max Size**: 20 connections
- **Peak Usage**: 20/20 (100%)
- **Average Wait Time**: 2.5s
- **Recommendation**: Increase to 50 connections
### Memory Usage
- **Average**: 450MB
- **Peak**: 890MB
- **Trend**: Stable (no leak detected)
## Security Findings
### Authentication
- **Failed Logins**: 12
- **Suspicious IPs**: 2 IPs with >5 failed attempts
- **Brute Force Attempts**: None detected
### Attack Patterns
- **SQL Injection Attempts**: 0
- **XSS Attempts**: 0
- **Path Traversal**: 0
## Root Cause Analysis
Based on log analysis:
**Primary Cause**: Database connection pool too small for traffic volume
**Contributing Factors**:
1. Traffic spike (+300%)
2. Slow queries consuming connections
3. No connection timeout configured
**Evidence**:
- Connection pool exhausted at 14:29:45
- Immediate correlation with error spike
- Recovery after auto-scaling added capacity
## Recommendations
### Immediate Actions
1. Increase database connection pool to 50
2. Add connection timeout (30s)
3. Optimize slow queries identified
### Monitoring Improvements
1. Alert on connection pool usage >80%
2. Track query duration P95
3. Monitor error rate per endpoint
### Code Changes
1. Add query timeouts to all database calls
2. Implement connection retry logic
3. Add circuit breaker for database calls
## Next Steps
1. **Fix**: Use `/debug fix` to implement connection pool increase
2. **Performance**: Use `/debug performance` to optimize slow queries
3. **Monitoring**: Add alerts for connection pool usage
## Appendices
### A. Full Error Log Excerpt
\`\`\`
[Relevant log excerpts]
\`\`\`
### B. Query Performance Data
\`\`\`sql
[Slow query details]
\`\`\`
### C. Traffic Pattern Graph
\`\`\`
[ASCII graph or description of traffic pattern]
\`\`\`
```
## Error Handling
**Logs Not Found**:
If specified log files don't exist:
1. List available log files
2. Suggest alternative log locations
3. Provide commands to locate logs (see the sketch below)
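A minimal sketch of such commands, assuming common locations like a local `logs/` directory and `/var/log` (adjust paths and service names for the actual deployment):
```bash
# List log files in the usual locations
ls -lh logs/ /var/log/*.log 2>/dev/null
# Find recently modified, non-empty log files anywhere on the filesystem
find / -name "*.log" -mmin -60 -size +0 2>/dev/null | head -20
# For systemd-managed services, logs may only exist in the journal
journalctl -u application-service.service --since "1 hour ago" | head -20
```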
**Logs Too Large**:
If logs are too large to analyze:
1. Focus on most recent data
2. Use sampling techniques
3. Analyze specific time windows (see the sketch below)
4. Suggest log aggregation tools
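A rough sketch of the first three tactics, assuming timestamps in `YYYY-MM-DD HH:MM:SS` form at the start of each line:
```bash
# 1. Focus on the most recent data
tail -n 100000 logs/application.log > /tmp/recent.log
# 2. Sample every 100th line for a representative subset
awk 'NR % 100 == 0' logs/application.log > /tmp/sampled.log
# 3. Analyze a specific time window
awk '$1" "$2 >= "2024-10-14 14:00:00" && $1" "$2 <= "2024-10-14 15:00:00"' logs/application.log
```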
**Insufficient Context**:
If logs lack necessary information:
1. Document what information is missing
2. Suggest additional logging
3. Recommend structured logging format (sketched below)
4. Propose log enrichment strategies
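To illustrate the structured-logging recommendation, a hypothetical JSON log line and a `jq` filter over it (the field names are examples, not the application's actual schema):
```bash
# Example of a structured (JSON-per-line) log entry the application could emit
echo '{"ts":"2024-10-14T14:30:15Z","level":"ERROR","request_id":"req_abc123","msg":"Database timeout","duration_ms":5008}' >> logs/structured.log
# Structured entries can then be filtered precisely instead of grepping free text
jq -c 'select(.level == "ERROR" and .duration_ms > 1000)' logs/structured.log
```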
## Integration with Other Operations
- **Before**: Use `/debug diagnose` to identify time period to analyze
- **After**: Use `/debug fix` to address issues found in logs
- **Related**: Use `/debug performance` for performance issues
- **Related**: Use `/debug reproduce` to recreate issues found in logs
## Agent Utilization
This operation leverages the **10x-fullstack-engineer** agent for:
- Pattern recognition across large log volumes
- Correlating events across multiple log sources
- Statistical analysis and anomaly detection
- Root cause inference from log patterns
- Actionable recommendations based on findings

commands/debug/diagnose.md Normal file
@@ -0,0 +1,759 @@
# Diagnose Operation - Comprehensive Diagnosis and Root Cause Analysis
You are executing the **diagnose** operation to perform comprehensive diagnosis and root cause analysis for complex issues spanning multiple layers of the application stack.
## Parameters
**Received**: `$ARGUMENTS` (after removing 'diagnose' operation name)
Expected format: `issue:"problem description" [environment:"prod|staging|dev"] [logs:"log-location"] [reproduction:"steps"] [impact:"severity"]`
## Workflow
### 1. Issue Understanding
Gather and analyze comprehensive information about the issue:
**Information to Collect**:
- **Symptom**: What is the observable problem? What exactly is failing?
- **Impact**: Who is affected? How many users? Business impact?
- **Frequency**: Consistent, intermittent, or rare? Percentage of occurrences?
- **Environment**: Production, staging, or development? Specific regions/zones?
- **Timeline**: When did it start? Any correlation with deployments?
- **Recent Changes**: Deployments, config changes, infrastructure changes?
- **Error Messages**: Complete error messages, stack traces, error codes
**Questions to Answer**:
```markdown
- What is the user experiencing?
- What should be happening instead?
- How widespread is the issue?
- Is it getting worse over time?
- Are there any patterns (time of day, user types, specific actions)?
```
### 2. Data Collection Across All Layers
Systematically collect diagnostic data from each layer of the stack:
#### Frontend Diagnostics
**Browser Console Analysis**:
```javascript
// Check for JavaScript errors: review console.error and console.warn output in DevTools
// Inspect unhandled promise rejections
window.addEventListener('unhandledrejection', event => {
console.error('Unhandled promise rejection:', event.reason);
});
// Check for resource loading failures
performance.getEntriesByType('resource').filter(r => r.transferSize === 0)
```
**Network Request Analysis**:
```javascript
// Analyze failed requests
// Open DevTools > Network tab
// Filter: Status code 4xx, 5xx
// Check: Request headers, payload, response body, timing
// Performance timing
const perfEntries = performance.getEntriesByType('navigation')[0];
console.log('DNS lookup:', perfEntries.domainLookupEnd - perfEntries.domainLookupStart);
console.log('TCP connection:', perfEntries.connectEnd - perfEntries.connectStart);
console.log('Request time:', perfEntries.responseStart - perfEntries.requestStart);
console.log('Response time:', perfEntries.responseEnd - perfEntries.responseStart);
```
**State Inspection**:
```javascript
// React DevTools: Component state at error time
// Redux DevTools: Action history, state snapshots
// Vue DevTools: Vuex state, component hierarchy
// Add error boundary to capture React errors
class ErrorBoundary extends React.Component {
componentDidCatch(error, errorInfo) {
console.error('Component error:', {
error: error.toString(),
componentStack: errorInfo.componentStack,
currentState: this.props.reduxState
});
}
}
```
#### Backend Diagnostics
**Application Logs**:
```bash
# Real-time application logs
tail -f logs/application.log
# Error logs with context
grep -i "error\|exception\|fatal" logs/*.log -A 10 -B 5
# Filter by request ID to trace single request
grep "request-id-12345" logs/*.log
# Find patterns in errors
awk '/ERROR/ {print $0}' logs/application.log | sort | uniq -c | sort -rn
# Time-based analysis
grep "2024-10-14 14:" logs/application.log | grep ERROR
```
**System Logs**:
```bash
# Service logs (systemd)
journalctl -u application-service.service -f
journalctl -u application-service.service --since "1 hour ago"
# Syslog
tail -f /var/log/syslog | grep application
# Kernel logs (for system-level issues)
dmesg -T | tail -50
```
**Application Metrics**:
```bash
# Request rate and response times
# Check APM tools: New Relic, Datadog, Elastic APM
# HTTP response codes over time
awk '{print $9}' /var/log/nginx/access.log | sort | uniq -c
# Slow requests (assumes request duration is logged as the 10th field)
awk '$10 > 1000 {print $0}' /var/log/nginx/access.log
# Error rate calculation
errors=$(grep -c "ERROR" logs/application.log)
total=$(wc -l < logs/application.log)
echo "Error rate: $(echo "scale=4; $errors / $total * 100" | bc)%"
```
#### Database Diagnostics
**Active Queries and Locks**:
```sql
-- PostgreSQL: Active queries
SELECT
pid,
now() - query_start AS duration,
state,
query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;
-- Long-running queries
SELECT
pid,
now() - query_start AS duration,
query
FROM pg_stat_activity
WHERE state = 'active'
AND now() - query_start > interval '1 minute';
-- Blocking queries
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks
ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;
-- Deadlock information (from logs)
-- Look for "deadlock detected" in PostgreSQL logs
```
**Database Performance**:
```sql
-- Table statistics
SELECT
schemaname,
tablename,
n_live_tup AS live_rows,
n_dead_tup AS dead_rows,
last_vacuum,
last_autovacuum
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC;
-- Index usage
SELECT
schemaname,
tablename,
indexname,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;
-- Connection count
SELECT
count(*) AS connections,
state,
usename
FROM pg_stat_activity
GROUP BY state, usename;
-- Cache hit ratio
SELECT
sum(heap_blks_read) AS heap_read,
sum(heap_blks_hit) AS heap_hit,
sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) AS cache_hit_ratio
FROM pg_statio_user_tables;
```
**Slow Query Log Analysis**:
```bash
# PostgreSQL: Enable log_min_duration_statement
# Check postgresql.conf: log_min_duration_statement = 1000 (1 second)
# Analyze slow queries
grep "duration:" /var/log/postgresql/postgresql.log | awk '{print $3, $6}' | sort -rn | head -20
```
#### Infrastructure Diagnostics
**Resource Usage**:
```bash
# CPU usage
top -bn1 | head -20
mpstat 1 5 # CPU stats every 1 second, 5 times
# Memory usage
free -h
vmstat 1 5
# Disk I/O
iostat -x 1 5
iotop -o # Only show processes doing I/O
# Disk space
df -h
du -sh /* | sort -rh | head -10
# Network connections
netstat -an | grep ESTABLISHED | wc -l
ss -s # Socket statistics
# Open files
lsof | wc -l
lsof -u application-user | wc -l
```
**Container Diagnostics (Docker/Kubernetes)**:
```bash
# Docker container logs
docker logs container-name --tail 100 -f
docker stats container-name
# Docker container inspection
docker inspect container-name
docker exec container-name ps aux
docker exec container-name df -h
# Kubernetes pod logs
kubectl logs pod-name -f
kubectl logs pod-name --previous # Previous container logs
# Kubernetes pod resource usage
kubectl top pods
kubectl describe pod pod-name
# Kubernetes events
kubectl get events --sort-by='.lastTimestamp'
```
**Cloud Provider Metrics**:
```bash
# AWS CloudWatch
aws cloudwatch get-metric-statistics \
--namespace AWS/EC2 \
--metric-name CPUUtilization \
--dimensions Name=InstanceId,Value=i-1234567890abcdef0 \
--start-time 2024-10-14T00:00:00Z \
--end-time 2024-10-14T23:59:59Z \
--period 3600 \
--statistics Average
# Check application logs
aws logs tail /aws/application/logs --follow
# GCP Stackdriver
gcloud logging read "resource.type=gce_instance AND severity>=ERROR" --limit 50
# Azure Monitor
az monitor metrics list --resource <resource-id> --metric "Percentage CPU"
```
### 3. Hypothesis Formation
Based on collected data, form testable hypotheses about the root cause:
**Common Issue Patterns to Consider**:
#### Race Conditions
**Symptoms**:
- Intermittent failures
- Works sometimes, fails other times
- Timing-dependent behavior
- "Cannot read property of undefined" on objects that should exist
**What to Check**:
```javascript
// Look for async operations without proper waiting
async function problematic() {
let data;
fetchData().then(result => data = result); // ❌ Race condition
return processData(data); // Runs before data is assigned, so data is undefined
}
// Proper async/await
async function correct() {
const data = await fetchData(); // ✅ Wait for data
return processData(data);
}
// Multiple parallel operations
Promise.all([op1(), op2(), op3()]) // Check for interdependencies
```
#### Memory Leaks
**Symptoms**:
- Degrading performance over time
- Increasing memory usage
- Eventually crashes with OOM errors
- Slow garbage collection
**What to Check**:
```javascript
// Event listeners not removed
componentDidMount() {
window.addEventListener('resize', this.handleResize);
// ❌ Missing removeEventListener in componentWillUnmount
}
// Closures holding references
function createLeak() {
const largeData = new Array(1000000);
return () => console.log(largeData[0]); // Holds entire array
}
// Timers not cleared
setInterval(() => fetchData(), 1000); // ❌ Never cleared
// Cache without eviction
const cache = {};
cache[key] = value; // ❌ Grows indefinitely
```
#### Database Issues
**Symptoms**:
- Slow queries
- Timeouts
- Deadlocks
- Connection pool exhausted
**What to Check**:
```sql
-- Missing indexes
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Look for "Seq Scan" on large tables
-- N+1 queries
-- Check if ORM is making one query per item in a loop
-- Long transactions
-- Find transactions open for extended periods
-- Lock contention
-- Check for blocking queries and deadlocks
```
#### Network Issues
**Symptoms**:
- Timeouts
- Intermittent connectivity
- DNS resolution failures
- SSL/TLS handshake errors
**What to Check**:
```bash
# DNS resolution
dig api.example.com
nslookup api.example.com
# Network latency
ping api.example.com
traceroute api.example.com
# TCP connection
telnet api.example.com 443
nc -zv api.example.com 443
# SSL/TLS verification
openssl s_client -connect api.example.com:443 -servername api.example.com
```
#### Authentication/Authorization
**Symptoms**:
- 401 Unauthorized errors
- 403 Forbidden errors
- Intermittent authentication failures
- Session expired errors
**What to Check**:
```javascript
// Token expiration
const token = jwt.decode(authToken);
console.log('Token expires:', new Date(token.exp * 1000));
// Session state
console.log('Session:', sessionStorage, localStorage);
// Cookie issues
console.log('Cookies:', document.cookie);
// CORS issues (browser console)
// Look for: "CORS policy: No 'Access-Control-Allow-Origin' header"
```
#### Configuration Issues
**Symptoms**:
- Works locally, fails in environment
- "Environment variable not set" errors
- Connection refused errors
- Permission denied errors
**What to Check**:
```bash
# Environment variables
printenv | grep APPLICATION
env | sort
# Configuration files
cat config/production.json
diff config/development.json config/production.json
# File permissions
ls -la config/
ls -la /var/application/
# Network configuration
cat /etc/hosts
cat /etc/resolv.conf
```
### 4. Hypothesis Testing
Systematically test each hypothesis:
**Testing Strategy**:
1. **Isolation**: Test each component in isolation
2. **Instrumentation**: Add detailed logging around suspected areas
3. **Reproduction**: Create minimal reproduction case
4. **Elimination**: Rule out hypotheses systematically
**Add Diagnostic Instrumentation**:
```javascript
// Detailed logging with context
console.log('[DIAG] Before operation:', {
timestamp: new Date().toISOString(),
user: currentUser,
state: JSON.stringify(currentState),
params: params
});
try {
const result = await operation(params);
console.log('[DIAG] Operation success:', {
timestamp: new Date().toISOString(),
result: result,
duration: Date.now() - startTime
});
} catch (error) {
console.error('[DIAG] Operation failed:', {
timestamp: new Date().toISOString(),
error: error.message,
stack: error.stack,
context: { user, state, params }
});
throw error;
}
// Performance timing
console.time('operation');
await operation();
console.timeEnd('operation');
// Memory usage tracking
// global.gc is only available when Node is started with --expose-gc
if (global.gc) {
global.gc();
const usage = process.memoryUsage();
console.log('[MEMORY]', {
heapUsed: Math.round(usage.heapUsed / 1024 / 1024) + 'MB',
heapTotal: Math.round(usage.heapTotal / 1024 / 1024) + 'MB',
external: Math.round(usage.external / 1024 / 1024) + 'MB'
});
}
```
**Binary Search Debugging**:
```javascript
// Comment out half the code
// Determine which half has the bug
// Repeat until isolated
// Example: Large function with error
function complexOperation() {
// Part 1: Data fetching
const data = fetchData();
// Part 2: Data processing
const processed = processData(data);
// Part 3: Data validation
const validated = validateData(processed);
// Part 4: Data saving
return saveData(validated);
}
// Test each part independently
const data = fetchData();
console.log('[TEST] Data fetched:', data); // ✅ Works
const processed = processData(testData);
console.log('[TEST] Data processed:', processed); // ❌ Fails here
// Now investigate processData() specifically
```
### 5. Root Cause Identification
Once hypotheses are tested and narrowed down:
**Confirm Root Cause**:
1. Can you consistently reproduce the issue?
2. Does fixing this cause resolve the symptom?
3. Are there other instances of the same issue?
4. Does the fix have any side effects?
**Document Evidence**:
- Specific code/config that causes the issue
- Exact conditions required for issue to manifest
- Why this causes the observed symptom
- Related code that might have same issue
### 6. Impact Assessment
Evaluate the full impact:
**User Impact**:
- Number of users affected
- Severity of impact (blocking, degraded, minor)
- User actions affected
- Business metrics impacted
**System Impact**:
- Performance degradation
- Resource consumption
- Downstream service effects
- Data integrity concerns
**Risk Assessment**:
- Can it cause data loss?
- Can it cause security issues?
- Can it cause cascading failures?
- Is it getting worse over time?
## Output Format
```markdown
# Diagnosis Report: [Issue Summary]
## Executive Summary
[One-paragraph summary of issue, root cause, and recommended action]
## Issue Description
### Symptoms
- [Observable symptom 1]
- [Observable symptom 2]
- [Observable symptom 3]
### Impact
- **Affected Users**: [number/percentage of users]
- **Severity**: [critical|high|medium|low]
- **Frequency**: [always|often|sometimes|rarely - with percentage]
- **Business Impact**: [revenue loss, user experience, etc.]
### Environment
- **Environment**: [production|staging|development]
- **Version**: [application version]
- **Infrastructure**: [relevant infrastructure details]
- **Region**: [if applicable]
### Timeline
- **First Observed**: [date/time]
- **Recent Changes**: [deployments, config changes]
- **Pattern**: [time-based, load-based, user-based]
## Diagnostic Data Collected
### Frontend Analysis
[Console errors, network requests, performance data, state inspection results]
### Backend Analysis
[Application logs, error traces, system metrics, request patterns]
### Database Analysis
[Query logs, lock information, performance metrics, connection pool status]
### Infrastructure Analysis
[Resource usage, container logs, cloud metrics, network diagnostics]
## Hypothesis Analysis
### Hypotheses Considered
1. **[Hypothesis 1]**: [Description]
- **Evidence For**: [supporting evidence]
- **Evidence Against**: [contradicting evidence]
- **Conclusion**: [Ruled out|Confirmed|Needs more investigation]
2. **[Hypothesis 2]**: [Description]
- **Evidence For**: [supporting evidence]
- **Evidence Against**: [contradicting evidence]
- **Conclusion**: [Ruled out|Confirmed|Needs more investigation]
3. **[Hypothesis 3]**: [Description]
- **Evidence For**: [supporting evidence]
- **Evidence Against**: [contradicting evidence]
- **Conclusion**: [Ruled out|Confirmed|Needs more investigation]
## Root Cause
### Root Cause Identified
[Detailed explanation of the root cause with specific code/config references]
### Why It Causes the Symptom
[Technical explanation of how the root cause leads to the observed behavior]
### Why It Wasn't Caught Earlier
[Explanation of why tests/monitoring didn't catch this]
### Related Issues
[Any similar issues that might exist or could be fixed with similar approach]
## Evidence
### Code/Configuration
\`\`\`[language]
[Specific code or configuration causing the issue]
\`\`\`
### Reproduction
[Exact steps to reproduce the issue consistently]
### Verification
[Steps taken to confirm this is the root cause]
## Recommended Actions
### Immediate Actions
1. [Immediate action 1 - e.g., rollback, circuit breaker]
2. [Immediate action 2]
### Permanent Fix
[Description of the permanent fix needed]
### Prevention
- **Monitoring**: [What monitoring to add]
- **Testing**: [What tests to add]
- **Code Review**: [What to look for in code reviews]
- **Documentation**: [What to document]
## Next Steps
1. **Fix Implementation**: [Use /debug fix operation]
2. **Verification**: [Testing strategy]
3. **Deployment**: [Rollout plan]
4. **Monitoring**: [What to watch]
## Appendices
### A. Detailed Logs
[Relevant log excerpts with context]
### B. Metrics and Graphs
[Performance metrics, error rates, resource usage]
### C. Related Tickets
[Links to related issues or tickets]
```
## Error Handling
**Insufficient Information**:
If diagnosis cannot be completed due to missing information:
1. List specific information needed
2. Explain why each piece is important
3. Provide instructions for gathering data
4. Suggest interim monitoring (see the sketch below)
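As one example of interim monitoring, a sketch that assumes errors are written to `logs/application.log` with the word ERROR:
```bash
# Print a cumulative error count once per minute so the trend stays visible while data is gathered
while true; do
  echo "$(date '+%Y-%m-%d %H:%M:%S') total_errors=$(grep -c ERROR logs/application.log)"
  sleep 60
done
```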
**Cannot Reproduce**:
If issue cannot be reproduced:
1. Document reproduction attempts
2. Request more detailed reproduction steps
3. Suggest environment comparison (see the sketch below)
4. Propose production debugging approach
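A minimal environment-comparison sketch, assuming per-environment config files and captured environment variable dumps at these example paths:
```bash
# Compare environment-specific configuration
diff config/development.json config/production.json
# Compare environment variables captured from each environment
diff <(sort dev.env) <(sort prod.env)
# Compare installed dependency versions against a snapshot taken in production
diff <(npm ls --depth=0 2>/dev/null) prod-npm-ls.txt
```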
**Multiple Root Causes**:
If multiple root causes are identified:
1. Prioritize by impact
2. Explain interdependencies
3. Provide fix sequence
4. Suggest monitoring between fixes
## Integration with Other Operations
After diagnosis is complete:
- **For fixes**: Use `/debug fix` with identified root cause
- **For reproduction**: Use `/debug reproduce` to create reliable test case
- **For log analysis**: Use `/debug analyze-logs` for deeper log investigation
- **For performance**: Use `/debug performance` if performance-related
- **For memory**: Use `/debug memory` if memory-related
## Agent Utilization
This operation leverages the **10x-fullstack-engineer** agent for:
- Systematic cross-layer analysis
- Pattern recognition across stack
- Hypothesis formation and testing
- Production debugging expertise
- Prevention-focused thinking

commands/debug/fix.md Normal file
@@ -0,0 +1,967 @@
# Fix Operation - Targeted Fix Implementation
You are executing the **fix** operation to implement targeted fixes with comprehensive verification and prevention measures.
## Parameters
**Received**: `$ARGUMENTS` (after removing 'fix' operation name)
Expected format: `issue:"problem description" root_cause:"identified-cause" [verification:"test-strategy"] [scope:"affected-areas"] [rollback:"rollback-plan"]`
## Workflow
### 1. Understand the Fix Requirements
Clarify what needs to be fixed and constraints:
**Key Information**:
- **Root Cause**: Exact cause to address (from diagnosis)
- **Scope**: What code/config/infrastructure needs changing
- **Constraints**: Performance, backwards compatibility, security
- **Verification**: How to verify the fix works
- **Rollback**: Plan if fix causes problems
**Fix Strategy Questions**:
```markdown
- Is this a code fix, configuration fix, or infrastructure fix?
- Are there multiple ways to fix this? Which is best?
- What are the side effects of the fix?
- Can we fix just the symptom or must we fix the root cause?
- Is there existing code doing this correctly we can learn from?
- What is the blast radius if the fix goes wrong?
```
### 2. Design the Fix
Plan the implementation approach:
#### Fix Pattern Selection
**Code Fix Patterns**:
**1. Add Missing Error Handling**
```javascript
// Before (causes crashes)
async function processPayment(orderId) {
const order = await db.orders.findById(orderId);
return await paymentGateway.charge(order.amount);
}
// After (handles errors properly)
async function processPayment(orderId) {
try {
const order = await db.orders.findById(orderId);
if (!order) {
throw new Error(`Order ${orderId} not found`);
}
if (order.status !== 'pending') {
throw new Error(`Order ${orderId} is not in pending status`);
}
const result = await paymentGateway.charge(order.amount);
if (!result.success) {
throw new Error(`Payment failed: ${result.error}`);
}
return result;
} catch (error) {
logger.error('Payment processing failed', {
orderId,
error: error.message,
stack: error.stack
});
throw new PaymentError(`Failed to process payment for order ${orderId}`, error);
}
}
```
**2. Fix Race Condition**
```javascript
// Before (race condition)
let cache = null;
async function getData() {
if (!cache) {
cache = await fetchFromDatabase(); // Multiple concurrent calls
}
return cache;
}
// After (properly synchronized)
let cache = null;
let cachePromise = null;
async function getData() {
if (!cache) {
if (!cachePromise) {
cachePromise = fetchFromDatabase();
}
cache = await cachePromise;
cachePromise = null;
}
return cache;
}
// Or use a proper caching library
const promiseMemoize = require('promise-memoize');
const getData = promiseMemoize(async () => {
return await fetchFromDatabase();
}, { maxAge: 60000 });
```
**3. Fix Memory Leak**
```javascript
// Before (memory leak)
class Component extends React.Component {
componentDidMount() {
window.addEventListener('resize', this.handleResize);
this.interval = setInterval(this.fetchData, 5000);
}
// componentWillUnmount missing - listeners never removed
}
// After (properly cleaned up)
class Component extends React.Component {
componentDidMount() {
window.addEventListener('resize', this.handleResize);
this.interval = setInterval(this.fetchData, 5000);
}
componentWillUnmount() {
window.removeEventListener('resize', this.handleResize);
clearInterval(this.interval);
}
}
```
**4. Add Missing Validation**
```javascript
// Before (no validation)
app.post('/api/users', async (req, res) => {
const user = await db.users.create(req.body);
res.json(user);
});
// After (proper validation)
const { body, validationResult } = require('express-validator');
app.post('/api/users',
// Validation middleware
body('email').isEmail().normalizeEmail(),
body('password').isLength({ min: 8 }).matches(/[A-Z]/).matches(/[0-9]/),
body('age').optional().isInt({ min: 0, max: 150 }),
async (req, res) => {
// Check validation results
const errors = validationResult(req);
if (!errors.isEmpty()) {
return res.status(400).json({ errors: errors.array() });
}
try {
const user = await db.users.create({
email: req.body.email,
password: await hashPassword(req.body.password),
age: req.body.age
});
res.json(user);
} catch (error) {
logger.error('User creation failed', error);
res.status(500).json({ error: 'Failed to create user' });
}
}
);
```
**5. Fix N+1 Query Problem**
```javascript
// Before (N+1 queries)
async function getUsersWithOrders() {
const users = await db.users.findAll();
for (const user of users) {
user.orders = await db.orders.findByUserId(user.id); // N queries
}
return users;
}
// After (single query with join)
async function getUsersWithOrders() {
const users = await db.users.findAll({
include: [
{ model: db.orders, as: 'orders' }
]
});
return users;
}
// Or with eager loading
async function getUsersWithOrders() {
const users = await db.users.findAll();
const userIds = users.map(u => u.id);
const orders = await db.orders.findAll({
where: { userId: userIds }
});
// Group orders by userId
const ordersByUser = orders.reduce((acc, order) => {
if (!acc[order.userId]) acc[order.userId] = [];
acc[order.userId].push(order);
return acc;
}, {});
// Attach to users
users.forEach(user => {
user.orders = ordersByUser[user.id] || [];
});
return users;
}
```
**Configuration Fix Patterns**:
**1. Fix Missing Environment Variable**
```bash
# Before (hardcoded)
DATABASE_URL=postgresql://localhost/myapp
# After (environment-specific)
# .env.production
DATABASE_URL=postgresql://prod-db.example.com:5432/myapp_prod?sslmode=require
```
Application code should validate required vars at startup:
```javascript
const requiredEnvVars = ['DATABASE_URL', 'API_KEY', 'SECRET_KEY'];
for (const envVar of requiredEnvVars) {
  if (!process.env[envVar]) {
    throw new Error(`Required environment variable ${envVar} is not set`);
  }
}
```
**2. Fix Resource Limits**
```yaml
# Before (no limits - causes OOM)
apiVersion: apps/v1
kind: Deployment
spec:
containers:
- name: app
image: myapp:latest
# After (proper resource limits)
apiVersion: apps/v1
kind: Deployment
spec:
containers:
- name: app
image: myapp:latest
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
```
**Infrastructure Fix Patterns**:
**1. Fix Nginx Upload Size Limit**
```nginx
# Before (default 1MB limit)
server {
listen 80;
server_name example.com;
location / {
proxy_pass http://localhost:3000;
}
}
# After (increased limit)
server {
listen 80;
server_name example.com;
# Increase max body size
client_max_body_size 50M;
location / {
proxy_pass http://localhost:3000;
# Increase timeouts for large uploads
proxy_read_timeout 300s;
proxy_connect_timeout 75s;
}
}
```
**2. Add Missing Database Index**
```sql
-- Before (slow query)
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Seq Scan on users (cost=0.00..1234.56 rows=1 width=123) (actual time=45.123..45.124 rows=1 loops=1)
-- After (add index)
CREATE INDEX idx_users_email ON users(email);
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Index Scan using idx_users_email on users (cost=0.29..8.30 rows=1 width=123) (actual time=0.012..0.013 rows=1 loops=1)
```
### 3. Implement the Fix
Execute the implementation with safety measures:
#### Implementation Checklist
**Pre-Implementation**:
- [ ] Create feature branch from main
- [ ] Review related code for similar issues
- [ ] Identify all affected areas
- [ ] Plan rollback strategy
- [ ] Prepare monitoring queries
**During Implementation**:
```bash
# Create feature branch
git checkout -b fix/issue-description
# Make changes incrementally
# Test after each change
# Commit with clear messages
git add file1.js
git commit -m "fix: add error handling to payment processing"
git add file2.js
git commit -m "fix: add validation for order status"
```
**Code Changes with Safety**:
```javascript
// Add defensive checks
function processOrder(order) {
// Validate inputs
if (!order) {
throw new Error('Order is required');
}
if (!order.id) {
throw new Error('Order must have an id');
}
// Log for debugging
logger.debug('Processing order', { orderId: order.id });
try {
// Main logic
const result = doProcessing(order);
// Validate output
if (!result || !result.success) {
throw new Error('Processing did not return success');
}
return result;
} catch (error) {
// Enhanced error context
logger.error('Order processing failed', {
orderId: order.id,
error: error.message,
stack: error.stack
});
// Re-throw with context
throw new ProcessingError(`Failed to process order ${order.id}`, error);
}
}
```
**Configuration Changes with Rollback**:
```bash
# Backup current config
cp /etc/nginx/nginx.conf /etc/nginx/nginx.conf.backup.$(date +%Y%m%d)
# Make changes
sudo vim /etc/nginx/nginx.conf
# Test configuration before applying
sudo nginx -t
# If test passes, reload
sudo nginx -s reload
# If issues occur, rollback
# sudo cp /etc/nginx/nginx.conf.backup.YYYYMMDD /etc/nginx/nginx.conf
# sudo nginx -s reload
```
**Database Changes with Safety**:
```sql
-- Note: CREATE INDEX CONCURRENTLY cannot run inside a transaction block
-- Create index concurrently (doesn't lock the table against writes)
CREATE INDEX CONCURRENTLY idx_users_email ON users(email);
-- Verify index was created
\d users
-- Test query with new index
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'test@example.com';
-- If issues, drop the index
-- DROP INDEX CONCURRENTLY idx_users_email;
```
### 4. Add Safeguards
Implement safeguards to prevent recurrence:
**Safeguard Types**:
**1. Input Validation**
```javascript
// Add schema validation
const Joi = require('joi');
const orderSchema = Joi.object({
id: Joi.string().uuid().required(),
userId: Joi.string().uuid().required(),
amount: Joi.number().positive().required(),
currency: Joi.string().length(3).required(),
status: Joi.string().valid('pending', 'processing', 'completed', 'failed').required()
});
function validateOrder(order) {
const { error, value } = orderSchema.validate(order);
if (error) {
throw new ValidationError(`Invalid order: ${error.message}`);
}
return value;
}
```
**2. Rate Limiting**
```javascript
const rateLimit = require('express-rate-limit');
// Prevent abuse
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 100, // limit each IP to 100 requests per windowMs
message: 'Too many requests from this IP'
});
app.use('/api/', limiter);
```
**3. Circuit Breaker**
```javascript
const CircuitBreaker = require('opossum');
// Protect against cascading failures
const breaker = new CircuitBreaker(externalApiCall, {
timeout: 3000, // 3 seconds
errorThresholdPercentage: 50,
resetTimeout: 30000 // 30 seconds
});
breaker.fallback(() => {
return { cached: true, data: getCachedData() };
});
async function callExternalApi(params) {
return await breaker.fire(params);
}
```
**4. Retry Logic**
```javascript
const retry = require('async-retry');
async function robustApiCall(params) {
return await retry(
async (bail) => {
try {
return await apiCall(params);
} catch (error) {
// Don't retry client errors
if (error.statusCode >= 400 && error.statusCode < 500) {
bail(error);
return;
}
// Retry server errors
throw error;
}
},
{
retries: 3,
minTimeout: 1000,
maxTimeout: 5000,
factor: 2
}
);
}
```
**5. Graceful Degradation**
```javascript
async function getRecommendations(userId) {
try {
// Try ML-based recommendations
return await mlRecommendationService.getRecommendations(userId);
} catch (error) {
logger.warn('ML recommendations failed, falling back to rule-based', error);
try {
// Fallback to rule-based
return await ruleBasedRecommendations(userId);
} catch (error2) {
logger.error('All recommendation methods failed', error2);
// Final fallback to popular items
return await getPopularItems();
}
}
}
```
### 5. Verification
Thoroughly verify the fix works:
**Verification Levels**:
**Level 1: Unit Tests**
```javascript
describe('processPayment', () => {
it('should handle missing order gracefully', async () => {
await expect(processPayment('nonexistent-id'))
.rejects
.toThrow('Order nonexistent-id not found');
});
it('should reject orders not in pending status', async () => {
const completedOrder = await createTestOrder({ status: 'completed' });
await expect(processPayment(completedOrder.id))
.rejects
.toThrow('is not in pending status');
});
it('should process valid pending orders', async () => {
const order = await createTestOrder({ status: 'pending', amount: 100 });
const result = await processPayment(order.id);
expect(result.success).toBe(true);
expect(result.transactionId).toBeDefined();
});
});
```
**Level 2: Integration Tests**
```javascript
describe('Payment Integration', () => {
it('should handle full payment flow', async () => {
// Create order
const order = await createOrder({ amount: 100 });
expect(order.status).toBe('pending');
// Process payment
const result = await processPayment(order.id);
expect(result.success).toBe(true);
// Verify order updated
const updatedOrder = await getOrder(order.id);
expect(updatedOrder.status).toBe('completed');
// Verify transaction recorded
const transaction = await getTransaction(result.transactionId);
expect(transaction.orderId).toBe(order.id);
});
});
```
**Level 3: Manual Testing**
```bash
# Test the fix manually
npm start
# In another terminal, reproduce the original issue
curl -X POST http://localhost:3000/api/orders/12345/payment
# Verify fix
# - Check response is successful
# - Check logs for proper error handling
# - Check database state is consistent
```
**Level 4: Load Testing**
```javascript
// Use k6 for load testing
import http from 'k6/http';
import { check, sleep } from 'k6';
export let options = {
stages: [
{ duration: '2m', target: 100 }, // Ramp up to 100 users
{ duration: '5m', target: 100 }, // Stay at 100 users
{ duration: '2m', target: 0 }, // Ramp down
],
};
export default function () {
let response = http.post('http://localhost:3000/api/orders/payment', {
orderId: '12345'
});
check(response, {
'status is 200': (r) => r.status === 200,
'no errors': (r) => !r.json('error')
});
sleep(1);
}
```
**Level 5: Production Smoke Test**
```bash
# After deployment, test in production
# Use feature flag if possible
# Test with low traffic
curl https://api.production.com/health
curl https://api.production.com/api/test-endpoint
# Monitor metrics
# - Error rate
# - Response time
# - Resource usage
# If issues detected, rollback immediately
```
### 6. Prevention Measures
Add measures to prevent similar issues:
**Prevention Strategies**:
**1. Add Regression Tests**
```javascript
// This test would have caught the bug
describe('Regression: Order Processing Bug #1234', () => {
it('should not crash when order is missing', async () => {
// This used to cause a crash
await expect(processPayment('missing-order'))
.rejects
.toThrow('Order missing-order not found');
// No crash, proper error thrown
});
});
```
**2. Add Monitoring**
```javascript
// Add custom metrics
const { Counter, Histogram } = require('prom-client');
const paymentErrors = new Counter({
name: 'payment_processing_errors_total',
help: 'Total payment processing errors',
labelNames: ['error_type']
});
const paymentDuration = new Histogram({
name: 'payment_processing_duration_seconds',
help: 'Payment processing duration',
labelNames: ['status']
});
async function processPayment(orderId) {
const end = paymentDuration.startTimer();
try {
const result = await _processPayment(orderId);
end({ status: 'success' });
return result;
} catch (error) {
paymentErrors.inc({ error_type: error.constructor.name });
end({ status: 'error' });
throw error;
}
}
```
**3. Add Alerting**
```yaml
# Prometheus alert rules
groups:
- name: payment_processing
rules:
- alert: HighPaymentErrorRate
expr: rate(payment_processing_errors_total[5m]) > 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "High payment error rate detected"
description: "Payment error rate is {{ $value }} errors/sec"
```
**4. Improve Logging**
```javascript
// Add structured logging
logger.info('Processing payment', {
orderId: order.id,
amount: order.amount,
userId: order.userId,
timestamp: new Date().toISOString()
});
// Log key decision points
logger.debug('Order validation passed', { orderId });
logger.debug('Calling payment gateway', { orderId, amount });
logger.debug('Payment gateway responded', { orderId, success: result.success });
```
**5. Update Documentation**
```markdown
# Common Issues and Solutions
## Issue: Payment Processing Fails Silently
**Symptoms**: Orders stuck in pending status
**Root Cause**: Missing error handling in payment processor
**Solution**: Added comprehensive error handling and logging
**Prevention**:
- All payment operations now have try-catch blocks
- Errors are logged with full context
- Alerts trigger on error rate > 10%
**Related Code**: src/services/payment-processor.js
**Tests**: tests/integration/payment-processing.test.js
**Monitoring**: Grafana dashboard "Payment Processing"
```
## Output Format
```markdown
# Fix Report: [Issue Summary]
## Summary
[Brief description of the fix implemented]
## Root Cause Addressed
[Detailed explanation of what root cause this fix addresses]
## Changes Made
### Code Changes
#### File: [path/to/file1]
**Purpose**: [Why this file was changed]
\`\`\`[language]
// Before
[original code]
// After
[fixed code]
// Why this works
[explanation]
\`\`\`
#### File: [path/to/file2]
**Purpose**: [Why this file was changed]
\`\`\`[language]
[changes with before/after]
\`\`\`
### Configuration Changes
#### File: [config/file]
\`\`\`
[configuration changes]
\`\`\`
**Impact**: [What this configuration change affects]
### Infrastructure Changes
#### Component: [infrastructure component]
\`\`\`
[infrastructure changes]
\`\`\`
**Impact**: [What this infrastructure change affects]
## Safeguards Added
### Input Validation
[Validation added to prevent bad inputs]
### Error Handling
[Error handling added for failure scenarios]
### Rate Limiting
[Rate limiting or throttling added]
### Monitoring
[Monitoring/metrics added]
### Alerting
[Alerts configured]
## Verification Results
### Unit Tests
\`\`\`
[test results]
\`\`\`
**Status**: ✅ All tests passing
### Integration Tests
\`\`\`
[test results]
\`\`\`
**Status**: ✅ All tests passing
### Manual Testing
[Description of manual testing performed]
**Status**: ✅ Issue no longer reproduces
### Load Testing
[Results of load testing]
**Status**: ✅ Performs well under load
## Prevention Measures
### Tests Added
- [Test 1]: Prevents regression
- [Test 2]: Covers edge case
### Monitoring Added
- [Metric 1]: Tracks error rate
- [Metric 2]: Tracks performance
### Alerts Configured
- [Alert 1]: Fires when error rate exceeds threshold
- [Alert 2]: Fires when performance degrades
### Documentation Updated
- [Doc 1]: Troubleshooting guide
- [Doc 2]: Runbook for oncall
## Deployment Plan
### Pre-Deployment
1. [Step 1]
2. [Step 2]
### Deployment
1. [Step 1]
2. [Step 2]
### Post-Deployment
1. [Step 1 - monitoring]
2. [Step 2 - verification]
### Rollback Plan
\`\`\`bash
[commands to rollback if needed]
\`\`\`
## Verification Steps
### How to Verify the Fix
1. [Verification step 1]
2. [Verification step 2]
### Expected Behavior After Fix
[Description of expected behavior]
### Monitoring Queries
\`\`\`
[queries to monitor fix effectiveness]
\`\`\`
## Related Issues
### Similar Issues Fixed
- [Related issue 1]
- [Related issue 2]
### Potential Similar Issues
- [Potential issue 1 to check]
- [Potential issue 2 to check]
## Lessons Learned
[Key insights from implementing this fix]
## Files Modified
- [file1]
- [file2]
- [file3]
## Commits
\`\`\`
[git log output showing fix commits]
\`\`\`
```
## Error Handling
**Fix Fails Verification**:
If fix doesn't resolve the issue:
1. Re-examine root cause analysis
2. Check if multiple issues present
3. Verify fix was implemented correctly
4. Add more diagnostic logging
**Fix Causes New Issues**:
If fix introduces side effects:
1. Rollback immediately (see the sketch below)
2. Analyze side effect cause
3. Redesign fix to avoid side effect
4. Add tests for side effect scenario
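A sketch of an immediate rollback, assuming the fix landed as a single commit and the service is deployed via Kubernetes (substitute the real commit hash and deployment name):
```bash
# Revert the fix commit and push
git revert --no-edit <fix-commit-sha>
git push origin main
# Or roll the running deployment back directly
kubectl rollout undo deployment/app
kubectl rollout status deployment/app
```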
**Cannot Deploy Fix**:
If deployment is blocked:
1. Implement workaround if possible
2. Document deployment blockers
3. Create deployment plan to address blockers
4. Consider feature flag for gradual rollout
## Integration with Other Operations
- **Before**: Use `/debug diagnose` to identify root cause
- **Before**: Use `/debug reproduce` to create test case
- **After**: Use `/debug performance` if fix affects performance
- **After**: Use `/debug memory` if fix affects memory usage
## Agent Utilization
This operation leverages the **10x-fullstack-engineer** agent for:
- Designing robust fixes that address root causes
- Implementing comprehensive safeguards
- Creating thorough verification strategies
- Considering performance and security implications
- Planning prevention measures

commands/debug/memory.md Normal file
File diff suppressed because it is too large
@@ -0,0 +1,965 @@
# Performance Operation - Performance Debugging and Profiling
You are executing the **performance** operation to debug performance issues, profile application behavior, and optimize system performance.
## Parameters
**Received**: `$ARGUMENTS` (after removing 'performance' operation name)
Expected format: `component:"component-name" [metric:"response-time|throughput|cpu|memory"] [threshold:"target-value"] [duration:"profile-duration"] [load:"concurrent-users"]`
## Workflow
### 1. Establish Performance Baseline
Measure current performance before optimization:
**Baseline Metrics to Capture**:
```bash
# Response time baseline
curl -w "@curl-format.txt" -o /dev/null -s http://localhost:3000/api/endpoint
# Create curl-format.txt
cat > curl-format.txt <<'EOF'
time_namelookup: %{time_namelookup}\n
time_connect: %{time_connect}\n
time_appconnect: %{time_appconnect}\n
time_pretransfer: %{time_pretransfer}\n
time_redirect: %{time_redirect}\n
time_starttransfer: %{time_starttransfer}\n
----------\n
time_total: %{time_total}\n
EOF
# Throughput baseline
ab -n 1000 -c 10 http://localhost:3000/api/endpoint
# Resource usage baseline
# CPU
mpstat 1 60 > baseline_cpu.txt
# Memory
free -m && ps aux --sort=-%mem | head -20
# Disk I/O
iostat -x 1 60 > baseline_io.txt
```
**Application Metrics**:
```javascript
// Add timing middleware
app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = Date.now() - start;
console.log({
method: req.method,
path: req.path,
status: res.statusCode,
duration: duration,
timestamp: new Date().toISOString()
});
});
next();
});
// Track key operations
const startTime = Date.now();
await operation();
const duration = Date.now() - startTime;
metrics.histogram('operation_duration', duration);
```
### 2. Identify Performance Bottlenecks
Use profiling to find slow components:
#### Application Profiling
**Node.js Profiling**:
```bash
# CPU profiling
node --prof app.js
# Run load test
ab -n 10000 -c 100 http://localhost:3000/
# Stop app, process profile
node --prof-process isolate-*-v8.log > processed.txt
# Chrome DevTools profiling
node --inspect app.js
# Open chrome://inspect
# Click "Open dedicated DevTools for Node"
# Go to Profiler tab, start profiling
# Clinic.js for comprehensive profiling
npm install -g clinic
clinic doctor -- node app.js
# Run load test
clinic doctor --visualize-only PID.clinic-doctor
```
**Python Profiling**:
```python
import cProfile
import pstats
# Profile a function
cProfile.run('my_function()', 'profile_stats')
# Analyze results
p = pstats.Stats('profile_stats')
p.sort_stats('cumulative')
p.print_stats(20)
# Line profiler for detailed profiling
from line_profiler import LineProfiler
profiler = LineProfiler()
profiler.add_function(my_function)
profiler.run('my_function()')
profiler.print_stats()
# Memory profiling
from memory_profiler import profile
@profile
def my_function():
large_list = [i for i in range(1000000)]
return sum(large_list)
```
**Use profiling utility script**:
```bash
# Run comprehensive profiling
./commands/debug/.scripts/profile.sh \
--app node_app \
--duration 60 \
--endpoint http://localhost:3000/api/slow
# Output: CPU profile, memory profile, flamegraph
```
#### Database Profiling
**Query Performance**:
```sql
-- PostgreSQL: Enable query timing
\timing on
-- Analyze query plan
EXPLAIN ANALYZE
SELECT u.*, o.*
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2024-01-01';
-- Look for:
-- - Seq Scan (sequential scan - bad for large tables)
-- - High cost estimates
-- - Large number of rows processed
-- - Missing indexes
-- Check slow queries
SELECT
query,
calls,
total_time,
mean_time,
max_time
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 20;
-- Find missing indexes
SELECT
schemaname,
tablename,
seq_scan,
seq_tup_read,
idx_scan,
seq_tup_read / seq_scan AS avg_seq_read
FROM pg_stat_user_tables
WHERE seq_scan > 0
ORDER BY seq_tup_read DESC
LIMIT 20;
```
**Connection Pool Analysis**:
```javascript
// Monitor connection pool
pool.on('acquire', (client) => {
console.log('Client acquired:', {
poolSize: pool.totalCount,
idleCount: pool.idleCount,
waitingCount: pool.waitingCount
});
});
pool.on('remove', (client) => {
console.log('Client removed from pool');
});
// Check pool stats periodically
setInterval(() => {
console.log('Pool stats:', {
total: pool.totalCount,
idle: pool.idleCount,
waiting: pool.waitingCount
});
}, 10000);
```
#### Network Profiling
**API Call Analysis**:
```bash
# Trace network calls
strace -c -p PID # System call tracing
# Detailed network timing
tcpdump -i any -w capture.pcap port 3000
# Analyze with Wireshark
# HTTP request tracing
curl -w "@curl-format.txt" -v http://localhost:3000/api/endpoint
# Check DNS resolution
time nslookup api.example.com
# Check network latency
ping -c 10 api.example.com
```
**Browser Performance**:
```javascript
// Use Performance API
performance.mark('start-operation');
await operation();
performance.mark('end-operation');
performance.measure('operation', 'start-operation', 'end-operation');
const measure = performance.getEntriesByName('operation')[0];
console.log('Operation took:', measure.duration, 'ms');
// Navigation timing
const perfData = performance.getEntriesByType('navigation')[0];
console.log({
dns: perfData.domainLookupEnd - perfData.domainLookupStart,
tcp: perfData.connectEnd - perfData.connectStart,
ttfb: perfData.responseStart - perfData.requestStart,
download: perfData.responseEnd - perfData.responseStart,
domReady: perfData.domContentLoadedEventEnd - perfData.domContentLoadedEventStart,
load: perfData.loadEventEnd - perfData.loadEventStart
});
// Resource timing
performance.getEntriesByType('resource').forEach(resource => {
console.log(resource.name, resource.duration);
});
```
### 3. Analyze Bottlenecks
Understand why components are slow:
#### CPU Bottlenecks
**Identify CPU-intensive operations**:
```javascript
// Find CPU-heavy code
const { performance } = require('perf_hooks');
function analyzePerformance() {
const start = performance.now();
// Suspect operation
const result = expensiveOperation();
const duration = performance.now() - start;
if (duration > 100) { // More than 100ms
console.warn('CPU-intensive operation detected:', {
operation: 'expensiveOperation',
duration: duration
});
}
return result;
}
```
**Common CPU bottlenecks**:
- Complex regex operations
- Large array/object operations
- JSON parsing/stringifying large objects
- Synchronous file operations
- Cryptographic operations
- Image processing
**Solutions**:
```javascript
// Before: Synchronous blocking
const data = JSON.parse(largeJsonString);
// After: Async in worker thread
const { Worker } = require('worker_threads');
function parseJsonAsync(jsonString) {
return new Promise((resolve, reject) => {
const worker = new Worker(`
const { parentPort } = require('worker_threads');
parentPort.on('message', (data) => {
const parsed = JSON.parse(data);
parentPort.postMessage(parsed);
});
`, { eval: true });
worker.on('message', resolve);
worker.on('error', reject);
worker.postMessage(jsonString);
});
}
```
#### I/O Bottlenecks
**Identify I/O-bound operations**:
```javascript
// Monitor I/O operations
const fs = require('fs').promises;
async function monitoredFileRead(path) {
const start = Date.now();
try {
const data = await fs.readFile(path);
const duration = Date.now() - start;
console.log('File read:', { path, duration, size: data.length });
if (duration > 50) {
console.warn('Slow file read detected:', path);
}
return data;
} catch (error) {
console.error('File read failed:', { path, error });
throw error;
}
}
```
**Common I/O bottlenecks**:
- Multiple database queries in sequence (N+1 problem)
- Synchronous file operations
- External API calls in sequence
- Large file uploads/downloads
**Solutions**:
```javascript
// Before: Sequential queries (N+1)
const users = await User.findAll();
for (const user of users) {
user.posts = await Post.findByUserId(user.id); // N queries
}
// After: Single query with join
const users = await User.findAll({
include: [{ model: Post }]
});
// Before: Sequential API calls
const user = await fetchUser(userId);
const orders = await fetchOrders(userId);
const profile = await fetchProfile(userId);
// After: Parallel execution
const [user, orders, profile] = await Promise.all([
fetchUser(userId),
fetchOrders(userId),
fetchProfile(userId)
]);
```
#### Memory Bottlenecks
**Identify memory issues**:
```javascript
// Monitor memory usage
function logMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`[${label}] Memory:`, {
rss: Math.round(usage.rss / 1024 / 1024) + 'MB',
heapTotal: Math.round(usage.heapTotal / 1024 / 1024) + 'MB',
heapUsed: Math.round(usage.heapUsed / 1024 / 1024) + 'MB',
external: Math.round(usage.external / 1024 / 1024) + 'MB'
});
}
logMemoryUsage('before-operation');
await operation();
logMemoryUsage('after-operation');
```
**Common memory bottlenecks**:
- Loading large datasets into memory
- Caching without size limits
- Memory leaks (event listeners, closures)
- Large object allocations
**Solutions**:
```javascript
// Before: Load entire file into memory
const data = await fs.readFile('large-file.csv', 'utf8');
const lines = data.split('\n');
// After: Stream processing
const readline = require('readline');
const stream = fs.createReadStream('large-file.csv');
const rl = readline.createInterface({ input: stream });
for await (const line of rl) {
processLine(line); // Process one line at a time
}
// Before: Unbounded cache
const cache = {};
cache[key] = value; // Grows forever
// After: LRU cache with size limit
const LRU = require('lru-cache');
const cache = new LRU({
max: 1000, // Max items
maxSize: 50 * 1024 * 1024, // 50MB
sizeCalculation: (value) => JSON.stringify(value).length
});
```
### 4. Implement Optimizations
Apply targeted optimizations:
#### Query Optimization
**Add Indexes**:
```sql
-- Before: Slow query
EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123;
-- Seq Scan on orders (cost=0.00..1234.56 rows=10 width=100) (actual time=45.123..45.456 rows=10 loops=1)
-- After: Add index
CREATE INDEX idx_orders_user_id ON orders(user_id);
EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123;
-- Index Scan using idx_orders_user_id on orders (cost=0.29..8.30 rows=10 width=100) (actual time=0.012..0.015 rows=10 loops=1)
```
**Optimize Queries**:
```sql
-- Before: Inefficient
SELECT * FROM orders o
LEFT JOIN users u ON o.user_id = u.id
WHERE o.created_at > NOW() - INTERVAL '7 days';
-- After: Select only needed columns, add index
CREATE INDEX idx_orders_created_at ON orders(created_at);
SELECT o.id, o.amount, u.name
FROM orders o
INNER JOIN users u ON o.user_id = u.id
WHERE o.created_at > NOW() - INTERVAL '7 days';
```
#### Caching
**Application-level caching**:
```javascript
const cache = new Map();
async function getCachedData(key) {
// Check cache first
if (cache.has(key)) {
console.log('Cache hit:', key);
return cache.get(key);
}
// Cache miss - fetch from database
console.log('Cache miss:', key);
const data = await fetchFromDatabase(key);
// Store in cache
cache.set(key, data);
// Expire after 5 minutes
setTimeout(() => cache.delete(key), 5 * 60 * 1000);
return data;
}
// Redis caching
const redis = require('redis');
const client = redis.createClient();
// With node-redis v4+, call await client.connect() before issuing commands
async function getCachedDataRedis(key) {
// Try cache
const cached = await client.get(key);
if (cached) {
return JSON.parse(cached);
}
// Fetch and cache
const data = await fetchFromDatabase(key);
await client.setEx(key, 300, JSON.stringify(data)); // 5 min TTL
return data;
}
```
#### Code Optimization
**Optimize algorithms**:
```javascript
// Before: O(n²) - slow for large arrays
function findDuplicates(arr) {
const duplicates = [];
for (let i = 0; i < arr.length; i++) {
for (let j = i + 1; j < arr.length; j++) {
if (arr[i] === arr[j]) {
duplicates.push(arr[i]);
}
}
}
return duplicates;
}
// After: O(n) - much faster
function findDuplicates(arr) {
const seen = new Set();
const duplicates = new Set();
for (const item of arr) {
if (seen.has(item)) {
duplicates.add(item);
} else {
seen.add(item);
}
}
return Array.from(duplicates);
}
```
**Lazy loading**:
```javascript
// Before: Load all data upfront
const allUsers = await User.findAll();
const allPosts = await Post.findAll();
// After: Load on demand
async function getUserWithPosts(userId, { includePosts = false } = {}) {
  const user = await User.findById(userId);
  // Only load posts when the caller asks for them
  if (includePosts) {
    user.posts = await Post.findByUserId(userId);
  }
  return user;
}
```
**Pagination**:
```javascript
// Before: Load all results
const results = await db.query('SELECT * FROM large_table');
// After: Paginate
const page = 1;
const pageSize = 100;
const results = await db.query(
'SELECT * FROM large_table LIMIT $1 OFFSET $2',
[pageSize, (page - 1) * pageSize]
);
```
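OFFSET-based pagination still scans and discards every skipped row, so deep pages get progressively slower. Keyset (cursor) pagination filters on the last id seen instead — a sketch assuming an indexed `id` column and the same hypothetical `db.query` helper:
```javascript
// Keyset pagination: the caller passes the last id from the previous page
async function getPage(lastSeenId = 0, pageSize = 100) {
  return db.query(
    'SELECT * FROM large_table WHERE id > $1 ORDER BY id LIMIT $2',
    [lastSeenId, pageSize]
  );
}
```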
#### Async Optimization
**Parallel execution**:
```javascript
// Before: Sequential (slow)
const user = await fetchUser();
const orders = await fetchOrders();
const payments = await fetchPayments();
// Total time: time(user) + time(orders) + time(payments)
// After: Parallel (fast)
const [user, orders, payments] = await Promise.all([
fetchUser(),
fetchOrders(),
fetchPayments()
]);
// Total time: max(time(user), time(orders), time(payments))
```
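Note that `Promise.all` rejects as soon as any call fails, discarding the other results. When partial results are acceptable, `Promise.allSettled` lets each outcome be handled independently — a brief sketch using the same hypothetical fetchers:
```javascript
// Each entry is { status: 'fulfilled', value } or { status: 'rejected', reason }
const [userResult, ordersResult, paymentsResult] = await Promise.allSettled([
  fetchUser(),
  fetchOrders(),
  fetchPayments()
]);

// Fall back to an empty list if the orders call failed
const orders = ordersResult.status === 'fulfilled' ? ordersResult.value : [];
```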
**Batch processing**:
```javascript
// Before: Process one at a time
for (const item of items) {
await processItem(item); // Slow for many items
}
// After: Process in batches
const batchSize = 10;
for (let i = 0; i < items.length; i += batchSize) {
const batch = items.slice(i, i + batchSize);
await Promise.all(batch.map(item => processItem(item)));
}
```
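Fixed-size batches wait for the slowest item in each batch before starting the next one. A small hand-rolled worker pool keeps a steady number of items in flight instead — a sketch only; the concurrency of 10 is an arbitrary starting point to tune:
```javascript
async function processWithConcurrency(items, concurrency = 10) {
  const queue = [...items];
  // Start `concurrency` workers that pull from the shared queue until it is empty
  const workers = Array.from({ length: concurrency }, async () => {
    while (queue.length > 0) {
      const item = queue.shift();
      await processItem(item);
    }
  });
  await Promise.all(workers);
}
```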
### 5. Load Testing
Verify optimizations under load:
**Load Testing Tools**:
**Apache Bench**:
```bash
# Simple load test
ab -n 10000 -c 100 http://localhost:3000/api/endpoint
# With keep-alive
ab -n 10000 -c 100 -k http://localhost:3000/api/endpoint
# POST with data
ab -n 1000 -c 10 -p data.json -T application/json http://localhost:3000/api/endpoint
```
**k6 (recommended)**:
```javascript
// load-test.js
import http from 'k6/http';
import { check, sleep } from 'k6';
export let options = {
stages: [
{ duration: '2m', target: 100 }, // Ramp up to 100 users
{ duration: '5m', target: 100 }, // Stay at 100 users
{ duration: '2m', target: 200 }, // Ramp up to 200 users
{ duration: '5m', target: 200 }, // Stay at 200 users
{ duration: '2m', target: 0 }, // Ramp down to 0
],
thresholds: {
http_req_duration: ['p(95)<500'], // 95% of requests < 500ms
http_req_failed: ['rate<0.01'], // Error rate < 1%
},
};
export default function () {
const response = http.get('http://localhost:3000/api/endpoint');
check(response, {
'status is 200': (r) => r.status === 200,
'response time < 500ms': (r) => r.timings.duration < 500,
});
sleep(1);
}
```
```bash
# Run load test
k6 run load-test.js
# With real-time monitoring
k6 run --out influxdb=http://localhost:8086/k6 load-test.js
```
**Artillery**:
```yaml
# load-test.yml
config:
target: 'http://localhost:3000'
phases:
- duration: 120
arrivalRate: 10
name: "Warm up"
- duration: 300
arrivalRate: 50
name: "Sustained load"
- duration: 120
arrivalRate: 100
name: "Peak load"
scenarios:
- name: "API endpoints"
flow:
- get:
url: "/api/users"
- get:
url: "/api/orders"
- post:
url: "/api/orders"
json:
userId: 123
amount: 100
```
```bash
# Run test
artillery run load-test.yml
# With report
artillery run --output report.json load-test.yml
artillery report report.json
```
### 6. Monitor Performance Improvements
Compare before and after:
**Metrics to Compare**:
```markdown
## Before Optimization
- Response time P50: 200ms
- Response time P95: 800ms
- Response time P99: 2000ms
- Throughput: 100 req/s
- Error rate: 2%
- CPU usage: 80%
- Memory usage: 1.5GB
## After Optimization
- Response time P50: 50ms ✅ 75% improvement
- Response time P95: 200ms ✅ 75% improvement
- Response time P99: 500ms ✅ 75% improvement
- Throughput: 400 req/s ✅ 4x improvement
- Error rate: 0.1% ✅ 20x improvement
- CPU usage: 40% ✅ 50% reduction
- Memory usage: 800MB ✅ 47% reduction
```
**Monitoring Dashboard**:
```javascript
// Expose metrics for Prometheus
const promClient = require('prom-client');
// Response time histogram
const httpDuration = new promClient.Histogram({
name: 'http_request_duration_seconds',
help: 'HTTP request duration',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});
// Throughput counter
const httpRequests = new promClient.Counter({
name: 'http_requests_total',
help: 'Total HTTP requests',
labelNames: ['method', 'route', 'status_code']
});
// Middleware to track metrics
app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
httpDuration.observe(
{ method: req.method, route: req.route?.path || req.path, status_code: res.statusCode },
duration
);
httpRequests.inc({
method: req.method,
route: req.route?.path || req.path,
status_code: res.statusCode
});
});
next();
});
// Metrics endpoint
app.get('/metrics', async (req, res) => {
res.set('Content-Type', promClient.register.contentType);
res.end(await promClient.register.metrics());
});
```
## Output Format
```markdown
# Performance Optimization Report: [Component Name]
## Summary
[Brief summary of optimization results]
## Performance Baseline
### Before Optimization
- **Response Time P50**: [value]ms
- **Response Time P95**: [value]ms
- **Response Time P99**: [value]ms
- **Throughput**: [value] req/s
- **Error Rate**: [value]%
- **CPU Usage**: [value]%
- **Memory Usage**: [value]MB
## Bottlenecks Identified
### Bottleneck 1: [Name]
- **Type**: [CPU|I/O|Memory|Network]
- **Location**: [file:line or component]
- **Impact**: [% of total time or resource usage]
- **Evidence**:
\`\`\`
[profiling data or logs showing bottleneck]
\`\`\`
### Bottleneck 2: [Name]
[similar structure]
## Optimizations Implemented
### Optimization 1: [Name]
**Problem**: [what was slow]
**Solution**: [what was done]
**Code Changes**:
\`\`\`[language]
// Before
[original slow code]
// After
[optimized code]
\`\`\`
**Impact**:
- Response time: [before]ms → [after]ms ([%] improvement)
- Resource usage: [before] → [after] ([%] improvement)
### Optimization 2: [Name]
[similar structure]
## Performance After Optimization
### After Optimization
- **Response Time P50**: [value]ms ✅ [%] improvement
- **Response Time P95**: [value]ms ✅ [%] improvement
- **Response Time P99**: [value]ms ✅ [%] improvement
- **Throughput**: [value] req/s ✅ [x]x improvement
- **Error Rate**: [value]% ✅ [%] improvement
- **CPU Usage**: [value]% ✅ [%] reduction
- **Memory Usage**: [value]MB ✅ [%] reduction
## Load Testing Results
### Test Configuration
- **Tool**: [k6|artillery|ab]
- **Duration**: [duration]
- **Peak Load**: [number] concurrent users
- **Total Requests**: [number]
### Results
\`\`\`
[load test output]
\`\`\`
### Performance Under Load
[Description of how system performed under sustained load]
## Profiling Data
### CPU Profile
[Flame graph or top CPU-consuming functions]
### Memory Profile
[Heap snapshots or memory allocation patterns]
### Query Performance
[Database query analysis results]
## Monitoring Setup
### Metrics Added
- [Metric 1]: Tracks [what]
- [Metric 2]: Tracks [what]
### Dashboards Created
- [Dashboard 1]: [URL and description]
- [Dashboard 2]: [URL and description]
### Alerts Configured
- [Alert 1]: Triggers when [condition]
- [Alert 2]: Triggers when [condition]
## Recommendations
### Additional Optimizations
1. [Optimization 1]: [Expected impact]
2. [Optimization 2]: [Expected impact]
### Monitoring
1. [What to monitor]
2. [What thresholds to set]
### Future Improvements
1. [Long-term improvement 1]
2. [Long-term improvement 2]
## Files Modified
- [file1]: [what was changed]
- [file2]: [what was changed]
## Verification Steps
### How to Verify
1. [Step 1]
2. [Step 2]
### Expected Behavior
[What should be observed]
## Next Steps
1. [Next step 1]
2. [Next step 2]
```
## Error Handling
**Optimization Degrades Performance**:
If an optimization makes things slower:
1. Rollback immediately
2. Re-profile to understand why
3. Check for introduced overhead
4. Verify test methodology
**Cannot Reproduce Performance Issue**:
If the issue only occurs in production:
1. Compare production vs test environment
2. Check production load patterns
3. Analyze production metrics
4. Consider production data characteristics
**Optimization Introduces Bugs**:
If an optimization causes errors:
1. Rollback optimization
2. Add comprehensive tests
3. Implement optimization incrementally
4. Verify correctness at each step
## Integration with Other Operations
- **Before**: Use `/debug diagnose` to identify performance issues
- **Before**: Use `/debug analyze-logs` to understand performance patterns
- **After**: Use `/debug fix` to implement optimizations
- **Related**: Use `/debug memory` for memory-specific optimization
## Agent Utilization
This operation leverages the **10x-fullstack-engineer** agent for:
- Identifying performance bottlenecks across the stack
- Suggesting appropriate optimization strategies
- Implementing code optimizations
- Designing comprehensive load tests
- Interpreting profiling data

commands/debug/reproduce.md Normal file

@@ -0,0 +1,695 @@
# Reproduce Operation - Issue Reproduction Strategies
You are executing the **reproduce** operation to create reliable reproduction strategies and test cases for debugging issues.
## Parameters
**Received**: `$ARGUMENTS` (after removing 'reproduce' operation name)
Expected format: `issue:"problem description" [environment:"prod|staging|dev"] [data:"test-data-location"] [steps:"reproduction-steps"] [reliability:"percentage"]`
## Workflow
### 1. Understand Reproduction Requirements
Gather information about the issue's behavior:
**Key Questions**:
- How often does the issue occur? (100%, 50%, 5%, etc.)
- Under what conditions? (specific data, timing, load, etc.)
- In which environments? (prod only, all environments)
- What is the expected vs actual behavior?
- Are there known workarounds?
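To answer the frequency question with data rather than anecdote, a rough count of matching log lines over time is often enough — a minimal Node sketch, assuming ISO-timestamped log lines and a hypothetical `logs/application.log` path and error signature:
```javascript
const fs = require('fs');
const readline = require('readline');

// Count lines containing the error signature, grouped by day
async function countOccurrences(logPath, signature) {
  const rl = readline.createInterface({ input: fs.createReadStream(logPath) });
  const perDay = new Map();
  for await (const line of rl) {
    if (!line.includes(signature)) continue;
    const day = line.slice(0, 10); // "YYYY-MM-DD" prefix of an ISO timestamp
    perDay.set(day, (perDay.get(day) || 0) + 1);
  }
  return perDay;
}

countOccurrences('logs/application.log', 'Request Entity Too Large')
  .then((perDay) => console.table(Object.fromEntries(perDay)));
```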
**Reproduction Challenges to Identify**:
- **Timing-dependent** (race conditions, timeouts)
- **Data-dependent** (specific user data, edge cases)
- **Environment-dependent** (prod-only config, specific infrastructure)
- **Load-dependent** (only under high load or concurrency)
- **State-dependent** (requires specific sequence of actions)
### 2. Gather Reproduction Context
Collect all information needed to reproduce:
#### Environment Context
**Application State**:
```bash
# Get application version
git log -1 --oneline
npm list # Node dependencies
pip freeze # Python dependencies
# Get configuration
cat .env.production
echo $ENVIRONMENT_VARS
# Get deployed version in production
kubectl get deployment app-name -o jsonpath='{.spec.template.spec.containers[0].image}'
```
**Infrastructure State**:
```bash
# System resources
free -m
df -h
ulimit -a
# Network configuration
ip addr show
cat /etc/resolv.conf
# Service status
systemctl status application-service
docker ps
kubectl get pods
```
#### Data Context
**Database State**:
```sql
-- Get relevant data schema
\d+ table_name
-- Get sample data that triggers issue
SELECT * FROM users WHERE id = 'problematic-user-id';
-- Get data statistics
SELECT count(*), min(created_at), max(created_at) FROM table_name;
-- Export test data
COPY (SELECT * FROM users WHERE id IN ('user1', 'user2')) TO '/tmp/test_data.csv' CSV HEADER;
```
**Request/Response Data**:
```bash
# Capture failing request
# Use browser DevTools > Network > Copy as cURL
curl 'https://api.example.com/endpoint' \
-H 'Authorization: Bearer TOKEN' \
-H 'Content-Type: application/json' \
--data-raw '{"key":"value"}' \
-v # Verbose output
# Capture webhook payload
# Check logs for incoming webhook data
grep "webhook_payload" logs/application.log | jq .
```
#### User Context
**User Session**:
```javascript
// Browser state
console.log('LocalStorage:', localStorage);
console.log('SessionStorage:', sessionStorage);
console.log('Cookies:', document.cookie);
console.log('User Agent:', navigator.userAgent);
// Authentication state
console.log('Auth Token:', authToken);
console.log('Token Payload:', jwt.decode(authToken));
console.log('Session ID:', sessionId);
```
**User Actions**:
```markdown
1. User logs in as user@example.com
2. Navigates to /dashboard
3. Clicks "Upload File" button
4. Selects file > 10MB
5. Clicks "Submit"
6. Error occurs: "Request Entity Too Large"
```
### 3. Create Local Reproduction
Develop a strategy to reproduce the issue locally:
#### Strategy 1: Direct Reproduction
**For Simple Issues**:
```javascript
// Create minimal test case
const assert = require('assert');

async function reproduceBug() {
  // Setup
  const testData = {
    userId: 'test-user',
    file: createLargeFile(15 * 1024 * 1024) // 15MB
  };
  // Execute problematic operation
  const result = await uploadFile(testData);
  // Verify issue occurs
  assert(result.status === 413, 'Expected 413 error');
}
```
#### Strategy 2: Environment Simulation
**For Environment-Specific Issues**:
```bash
# Replicate production configuration locally
cp .env.production .env.local
sed -i 's/prod-database/localhost:5432/g' .env.local
# Use production data dump
psql local_db < production_data_dump.sql
# Run with production-like settings
NODE_ENV=production npm start
```
#### Strategy 3: Data-Driven Reproduction
**For Data-Specific Issues**:
```javascript
// Load production data that triggers issue
const testData = require('./test-data/problematic-user-data.json');
// Seed database with specific data
await db.users.insert(testData.user);
await db.orders.insertMany(testData.orders);
// Execute operation
const result = await processOrder(testData.orders[0].id);
```
#### Strategy 4: Timing-Based Reproduction
**For Race Conditions**:
```javascript
// Add delays to expose race condition
async function reproduceRaceCondition() {
// Start two operations simultaneously
const [result1, result2] = await Promise.all([
operation1(),
operation2()
]);
// Or use setTimeout to control timing
setTimeout(() => operation1(), 0);
setTimeout(() => operation2(), 1); // 1ms delay
}
// Add intentional delays to expose timing issues
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function operation() {
  await fetchData();
  await sleep(100); // Artificial delay
  await processData(); // May fail if timing-dependent
}
```
#### Strategy 5: Load-Based Reproduction
**For Performance/Concurrency Issues**:
```javascript
// Simulate concurrent requests
async function reproduceUnderLoad() {
const concurrentRequests = 100;
const requests = Array(concurrentRequests)
.fill(null)
.map(() => makeRequest());
const results = await Promise.allSettled(requests);
const failures = results.filter(r => r.status === 'rejected');
console.log(`Failure rate: ${failures.length}/${concurrentRequests}`);
}
```
```bash
# Use load testing tools
ab -n 1000 -c 100 http://localhost:3000/api/endpoint
# Use k6 for more complex scenarios
k6 run load-test.js
# Monitor during load test
watch -n 1 'ps aux | grep node'
```
### 4. Verify Reproduction Reliability
Test that reproduction is reliable:
**Reliability Testing**:
```javascript
async function testReproductionReliability() {
  const iterations = 50;
  let reproducedCount = 0;
  for (let i = 0; i < iterations; i++) {
    try {
      await reproduceIssue(); // Resolves when the issue reproduces
      reproducedCount++;
    } catch (error) {
      // Issue did not reproduce on this attempt
    }
  }
  const reliability = (reproducedCount / iterations) * 100;
  console.log(`Reproduction reliability: ${reliability}%`);
  if (reliability < 80) {
    console.warn('Reproduction is not reliable enough. Need to refine.');
  }
}
```
**Improve Reliability**:
```javascript
// If reliability is low, add more constraints
async function improvedReproduction() {
// 1. Reset state between attempts
await resetDatabase();
await clearCache();
// 2. Add specific data constraints
const testUser = await createUserWithSpecificProfile({
accountAge: 30, // days
orderCount: 5,
subscriptionTier: 'premium'
});
// 3. Control timing precisely
await sleep(100); // Ensure service is ready
// 4. Set specific environment conditions
process.env.FEATURE_FLAG_X = 'true';
// Execute
await reproduceIssue();
}
```
### 5. Create Automated Test Case
Convert reproduction into automated test:
**Unit Test Example**:
```javascript
describe('File Upload Bug', () => {
beforeEach(async () => {
// Setup test environment
await resetTestDatabase();
await clearUploadDirectory();
});
it('should handle files larger than 10MB', async () => {
// Arrange
const largeFile = createTestFile(15 * 1024 * 1024);
const user = await createTestUser();
// Act
const response = await uploadFile(user.id, largeFile);
// Assert
expect(response.status).toBe(413);
expect(response.body.error).toContain('File too large');
});
it('should succeed with files under 10MB', async () => {
// Verify issue is specifically about size
const smallFile = createTestFile(5 * 1024 * 1024);
const user = await createTestUser();
const response = await uploadFile(user.id, smallFile);
expect(response.status).toBe(200);
});
});
```
**Integration Test Example**:
```javascript
describe('Order Processing Race Condition', () => {
it('should handle concurrent order updates safely', async () => {
// Setup
const order = await createTestOrder({ status: 'pending' });
// Simulate race condition
const updatePromises = [
updateOrderStatus(order.id, 'processing'),
updateOrderStatus(order.id, 'confirmed')
];
// Both should complete without error
await Promise.all(updatePromises);
// Verify final state is consistent
const finalOrder = await getOrder(order.id);
expect(['processing', 'confirmed']).toContain(finalOrder.status);
// Verify no data corruption
const auditLogs = await getOrderAuditLogs(order.id);
expect(auditLogs).toHaveLength(2);
});
});
```
**E2E Test Example**:
```javascript
describe('Dashboard Load Performance', () => {
it('should load dashboard under 2 seconds', async () => {
// Setup user with large dataset
const user = await createUserWithLargeDataset({
orders: 1000,
documents: 500
});
// Login
await page.goto('/login');
await page.fill('#email', user.email);
await page.fill('#password', 'testpass123');
await page.click('#login-button');
// Navigate to dashboard and measure time
const startTime = Date.now();
await page.goto('/dashboard');
await page.waitForSelector('.dashboard-loaded');
const loadTime = Date.now() - startTime;
// Assert performance
expect(loadTime).toBeLessThan(2000);
});
});
```
### 6. Document Reproduction Steps
Create comprehensive reproduction documentation:
**Reproduction Guide Template**:
```markdown
# Reproduction Guide: [Issue Name]
## Prerequisites
- Node.js v18.x
- PostgreSQL 14+
- Docker (optional)
- Test account credentials
## Environment Setup
### 1. Clone and Install
\`\`\`bash
git clone https://github.com/org/repo.git
cd repo
npm install
\`\`\`
### 2. Database Setup
\`\`\`bash
# Create test database
createdb test_app
# Load test data
psql test_app < test-data/problematic_data.sql
\`\`\`
### 3. Configuration
\`\`\`bash
# Copy test environment file
cp .env.test .env
# Update with test database URL
echo "DATABASE_URL=postgresql://localhost/test_app" >> .env
\`\`\`
## Reproduction Steps
### Manual Reproduction
1. Start the application:
\`\`\`bash
npm start
\`\`\`
2. Login with test user:
- Email: test@example.com
- Password: testpass123
3. Navigate to Dashboard: http://localhost:3000/dashboard
4. Click "Upload File" button
5. Select file larger than 10MB from test-data/
6. Click "Submit"
7. **Expected**: File uploads successfully
**Actual**: 413 Request Entity Too Large error
### Automated Reproduction
\`\`\`bash
# Run reproduction test
npm test -- tests/reproduction/file-upload-bug.test.js
# Expected output:
# ✓ reproduces 413 error with files > 10MB
# ✓ succeeds with files < 10MB
\`\`\`
## Reproduction Reliability
- **Success Rate**: 100% (fails every time)
- **Environment**: All environments
- **Conditions**: File size > 10MB
## Key Observations
- Issue occurs consistently with files > 10MB
- Works fine with files ≤ 10MB
- Error comes from Nginx, not application
- Content-Length header shows correct size
## Debugging Hints
- Check Nginx configuration: `/etc/nginx/nginx.conf`
- Look for `client_max_body_size` directive
- Application code may be fine, infrastructure issue
## Related Files
- test-data/large-file.bin (15MB test file)
- test-data/problematic_data.sql (test database dump)
- tests/reproduction/file-upload-bug.test.js (automated test)
```
### 7. Validate Different Scenarios
Test edge cases and variations:
**Scenario Matrix**:
```javascript
const testScenarios = [
// Vary file sizes
{ fileSize: '1MB', expected: 'success' },
{ fileSize: '10MB', expected: 'success' },
{ fileSize: '11MB', expected: 'failure' },
{ fileSize: '50MB', expected: 'failure' },
// Vary file types
{ fileType: 'image/jpeg', expected: 'success' },
{ fileType: 'application/pdf', expected: 'success' },
{ fileType: 'video/mp4', expected: 'failure' },
// Vary user types
{ userType: 'free', expected: 'failure' },
{ userType: 'premium', expected: 'success' },
// Vary environments
{ environment: 'local', expected: 'success' },
{ environment: 'staging', expected: 'failure' },
{ environment: 'production', expected: 'failure' }
];
for (const scenario of testScenarios) {
const result = await testScenario(scenario);
console.log(`Scenario ${JSON.stringify(scenario)}: ${result}`);
}
```
## Output Format
```markdown
# Reproduction Report: [Issue Name]
## Summary
[Brief description of reproduction strategy and success]
## Reproduction Reliability
- **Success Rate**: [percentage]%
- **Environment**: [local|staging|production|all]
- **Conditions**: [specific conditions needed]
- **Timing**: [immediate|delayed|intermittent]
## Prerequisites
### Environment Requirements
- [Software requirement 1]
- [Software requirement 2]
- [Configuration requirement 1]
### Data Requirements
- [Test data 1]
- [Test data 2]
- [Database state]
### Access Requirements
- [Credentials needed]
- [Permissions needed]
- [Resources needed]
## Reproduction Steps
### Quick Reproduction
\`\`\`bash
# Fastest way to reproduce
[commands to quickly reproduce the issue]
\`\`\`
### Detailed Reproduction
#### Step 1: [Setup]
\`\`\`bash
[detailed commands]
\`\`\`
[Expected result]
#### Step 2: [Preparation]
\`\`\`bash
[detailed commands]
\`\`\`
[Expected result]
#### Step 3: [Trigger Issue]
\`\`\`bash
[detailed commands]
\`\`\`
**Expected**: [expected behavior]
**Actual**: [actual behavior with issue]
## Automated Test Case
### Test Code
\`\`\`[language]
[Complete automated test that reproduces the issue]
\`\`\`
### Running the Test
\`\`\`bash
[command to run the test]
\`\`\`
### Expected Output
\`\`\`
[what the test output should show]
\`\`\`
## Scenario Variations
### Variation 1: [Description]
- **Conditions**: [conditions]
- **Result**: [occurs|does not occur]
- **Notes**: [observations]
### Variation 2: [Description]
- **Conditions**: [conditions]
- **Result**: [occurs|does not occur]
- **Notes**: [observations]
## Key Observations
### What Triggers the Issue
- [Trigger 1]
- [Trigger 2]
- [Trigger 3]
### What Prevents the Issue
- [Prevention 1]
- [Prevention 2]
### Minimal Reproduction
[Simplest possible way to reproduce]
## Test Data Files
### File 1: [filename]
**Location**: [path]
**Purpose**: [what this file is for]
**Contents**: [brief description]
### File 2: [filename]
**Location**: [path]
**Purpose**: [what this file is for]
**Contents**: [brief description]
## Troubleshooting Reproduction
### If Reproduction Fails
1. [Check 1]
2. [Check 2]
3. [Check 3]
### Common Issues
- **Issue**: [problem with reproduction]
**Solution**: [how to fix]
- **Issue**: [problem with reproduction]
**Solution**: [how to fix]
## Next Steps
1. **Diagnosis**: Use `/debug diagnose` with reproduction steps
2. **Fix**: Use `/debug fix` once root cause is identified
3. **Verification**: Re-run reproduction after fix to verify resolution
## Appendices
### A. Test Data
[Links to or contents of test data files]
### B. Environment Configuration
[Complete environment configuration needed]
### C. Video/Screenshots
[If applicable, links to recordings showing the issue]
```
## Error Handling
**Cannot Reproduce Locally**:
If the issue cannot be reproduced in the local environment:
1. Document what was tried
2. List environment differences
3. Suggest production debugging approach
4. Create monitoring to capture more data
**Unreliable Reproduction**:
If reproduction is intermittent:
1. Identify factors affecting reliability
2. Add more constraints to increase reliability
3. Document reliability percentage
4. Suggest statistical testing approach
**Missing Prerequisites**:
If prerequisites are unavailable:
1. List what's missing
2. Suggest alternatives
3. Propose workaround strategies
4. Document assumptions
## Integration with Other Operations
- **Before**: Use `/debug diagnose` to understand the issue first
- **After**: Use `/debug fix` to implement the fix
- **Related**: Use `/debug analyze-logs` to gather more reproduction context
## Agent Utilization
This operation leverages the **10x-fullstack-engineer** agent for:
- Creating reliable reproduction strategies
- Designing comprehensive test cases
- Identifying edge cases and variations
- Documenting reproduction steps clearly

commands/debug/skill.md Normal file

@@ -0,0 +1,83 @@
---
description: Comprehensive debugging toolkit for complex issues - diagnosis, reproduction, log analysis, performance, and memory debugging
argument-hint: <operation> [parameters...]
model: inherit
---
# Debug Skill - Advanced Debugging Operations
You are routing requests to specialized debugging operations. Parse the `$ARGUMENTS` to determine which debugging operation to execute.
## Available Operations
- **diagnose** - Comprehensive diagnosis and root cause analysis across all stack layers
- **reproduce** - Create reliable reproduction strategies and test cases for issues
- **fix** - Implement targeted fixes with verification and prevention measures
- **analyze-logs** - Deep log analysis with pattern detection and timeline correlation
- **performance** - Performance debugging, profiling, and optimization
- **memory** - Memory leak detection, analysis, and optimization
## Routing Logic
Extract the first word from `$ARGUMENTS` as the operation name, and pass the remainder as operation parameters.
**Arguments received**: `$ARGUMENTS`
**Routing Instructions**:
1. **Parse the operation**: Extract the first word from `$ARGUMENTS`
2. **Load operation instructions**: Read the corresponding operation file from `.claude/commands/debug/`
3. **Execute with context**: Follow the operation's instructions with the remaining parameters
4. **Leverage agent**: All operations can leverage the 10x-fullstack-engineer agent for deep expertise
## Operation Routing
```
diagnose → Read and follow: .claude/commands/debug/diagnose.md
reproduce → Read and follow: .claude/commands/debug/reproduce.md
fix → Read and follow: .claude/commands/debug/fix.md
analyze-logs → Read and follow: .claude/commands/debug/analyze-logs.md
performance → Read and follow: .claude/commands/debug/performance.md
memory → Read and follow: .claude/commands/debug/memory.md
```
## Base Directory
All operation files are located at: `.claude/commands/debug/`
## Error Handling
If no operation is specified or the operation is not recognized:
**Available debugging operations**:
- `/debug diagnose issue:"..." [environment:"..."] [logs:"..."]` - Comprehensive diagnosis
- `/debug reproduce issue:"..." [environment:"..."] [data:"..."]` - Create reproduction strategy
- `/debug fix issue:"..." root_cause:"..." [verification:"..."]` - Implement targeted fix
- `/debug analyze-logs path:"..." [pattern:"..."] [timeframe:"..."]` - Deep log analysis
- `/debug performance component:"..." [metric:"..."] [threshold:"..."]` - Performance debugging
- `/debug memory component:"..." [symptom:"..."] [duration:"..."]` - Memory debugging
**Example usage**:
```
/debug diagnose issue:"Users getting 500 errors on file upload" environment:"production" logs:"logs/app.log"
/debug reproduce issue:"Payment webhook fails intermittently" environment:"staging" data:"sample-webhook-payload.json"
/debug fix issue:"Race condition in order processing" root_cause:"Missing transaction lock" verification:"run-integration-tests"
/debug analyze-logs path:"logs/application.log" pattern:"ERROR.*timeout" timeframe:"last-24h"
/debug performance component:"api-endpoint:/orders" metric:"response-time" threshold:"200ms"
/debug memory component:"background-worker" symptom:"growing-heap" duration:"6h"
```
Please specify an operation and provide the necessary parameters.
## Integration with 10x-fullstack-engineer Agent
All debugging operations are designed to work seamlessly with the 10x-fullstack-engineer agent, which provides:
- Cross-stack debugging expertise
- Systematic root cause analysis
- Production-grade debugging strategies
- Performance and security awareness
- Prevention-focused mindset
## Execution
Based on the parsed operation from `$ARGUMENTS`, read the appropriate operation file and follow its instructions with the remaining parameters.