Initial commit
commands/debug/.scripts/analyze-logs.sh (new executable file, 230 lines)
@@ -0,0 +1,230 @@
#!/bin/bash
# Purpose: Analyze log files for patterns, errors, and anomalies
# Version: 1.0.0
# Usage: ./analyze-logs.sh --file <log-file> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: awk, grep, sed, jq (optional for JSON logs)

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
LOG_FILE=""
PATTERN=""
LEVEL=""
CONTEXT_LINES=5
START_TIME=""
END_TIME=""
OUTPUT_FORMAT="text"
SINCE=""

# Help message
show_help() {
    cat << EOF
Log Analysis Utility

Usage: $0 --file <log-file> [options]

Options:
    --file FILE        Log file to analyze (required)
    --pattern REGEX    Filter by regex pattern
    --level LEVEL      Filter by log level (ERROR|WARN|INFO|DEBUG)
    --context N        Show N lines before and after matches (default: 5)
    --start TIME       Start time (format: "YYYY-MM-DD HH:MM:SS")
    --end TIME         End time (format: "YYYY-MM-DD HH:MM:SS")
    --since DURATION   Time ago (e.g., "1 hour ago", "30 minutes ago")
    --format FORMAT    Output format: text|json (default: text)
    -h, --help         Show this help message

Examples:
    # Find all errors in last hour
    $0 --file app.log --level ERROR --since "1 hour ago"

    # Find timeout errors with context
    $0 --file app.log --pattern "timeout" --context 10

    # Analyze specific timeframe
    $0 --file app.log --start "2024-10-14 14:00:00" --end "2024-10-14 15:00:00"

EOF
    exit 0
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --file)
            LOG_FILE="$2"
            shift 2
            ;;
        --pattern)
            PATTERN="$2"
            shift 2
            ;;
        --level)
            LEVEL="$2"
            shift 2
            ;;
        --context)
            CONTEXT_LINES="$2"
            shift 2
            ;;
        --start)
            START_TIME="$2"
            shift 2
            ;;
        --end)
            END_TIME="$2"
            shift 2
            ;;
        --since)
            SINCE="$2"
            shift 2
            ;;
        --format)
            OUTPUT_FORMAT="$2"
            shift 2
            ;;
        -h|--help)
            show_help
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}" >&2
            exit 2
            ;;
    esac
done

# Validate required parameters
if [ -z "$LOG_FILE" ]; then
    echo -e "${RED}Error: --file is required${NC}" >&2
    echo "Use --help for usage information"
    exit 2
fi

if [ ! -f "$LOG_FILE" ]; then
    echo -e "${RED}Error: Log file not found: $LOG_FILE${NC}" >&2
    exit 1
fi

# Functions
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Convert "since" to start time
if [ -n "$SINCE" ]; then
    if command -v date &> /dev/null; then
        # GNU date understands -d "1 hour ago"; the BSD/macOS fallback
        # (date -v -1H) always means "1 hour ago" regardless of $SINCE
        START_TIME=$(date -d "$SINCE" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -v -1H '+%Y-%m-%d %H:%M:%S')
    fi
fi

log_info "Analyzing log file: $LOG_FILE"

# Build the filter pipeline as a string (evaluated below with eval, so
# --pattern and --level values must come from a trusted caller)
GREP_CMD="cat '$LOG_FILE'"

# Time filtering
if [ -n "$START_TIME" ]; then
    log_info "Filtering from: $START_TIME"
    GREP_CMD="$GREP_CMD | awk '\$0 >= \"$START_TIME\"'"
fi

if [ -n "$END_TIME" ]; then
    log_info "Filtering to: $END_TIME"
    GREP_CMD="$GREP_CMD | awk '\$0 <= \"$END_TIME\"'"
fi

# Level filtering
if [ -n "$LEVEL" ]; then
    log_info "Filtering by level: $LEVEL"
    GREP_CMD="$GREP_CMD | grep -i '$LEVEL'"
fi

# Pattern filtering
if [ -n "$PATTERN" ]; then
    log_info "Filtering by pattern: $PATTERN"
    GREP_CMD="$GREP_CMD | grep -E '$PATTERN' -A $CONTEXT_LINES -B $CONTEXT_LINES"
fi

# Execute filtering ("|| true" keeps set -e from aborting the script
# when grep finds no matches and exits non-zero)
FILTERED_OUTPUT=$(eval "$GREP_CMD" || true)

if [ -z "$FILTERED_OUTPUT" ]; then
    log_warn "No matching log entries found"
    exit 0
fi

# Count results
MATCH_COUNT=$(echo "$FILTERED_OUTPUT" | wc -l)
log_info "Found $MATCH_COUNT matching lines"

# Analysis
echo ""
echo "═══════════════════════════════════════════════════════════"
echo "                  LOG ANALYSIS RESULTS"
echo "═══════════════════════════════════════════════════════════"
echo ""

# Error statistics (grep -c prints 0 but exits 1 on zero matches,
# hence "|| true" under set -e / pipefail)
echo "Error Statistics:"
echo "─────────────────────────────────────────────────────────"
ERROR_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "ERROR" || true)
WARN_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "WARN" || true)
INFO_COUNT=$(echo "$FILTERED_OUTPUT" | grep -ci "INFO" || true)

echo "  ERROR: $ERROR_COUNT"
echo "  WARN:  $WARN_COUNT"
echo "  INFO:  $INFO_COUNT"
echo ""

# Top errors
echo "Top Error Messages (Top 10):"
echo "─────────────────────────────────────────────────────────"
echo "$FILTERED_OUTPUT" | grep -i "ERROR" | awk -F'ERROR' '{print $2}' | sort | uniq -c | sort -rn | head -10 || echo "  No errors found"
echo ""

# Time distribution (if timestamps present)
echo "Time Distribution:"
echo "─────────────────────────────────────────────────────────"
echo "$FILTERED_OUTPUT" | awk '{print substr($0, 1, 13)}' | sort | uniq -c | tail -20 || echo "  No timestamp pattern detected"
echo ""

# Output filtered results
if [ "$OUTPUT_FORMAT" = "json" ]; then
    log_info "Generating JSON output..."
    # Simple JSON array of log lines; backslashes and double quotes are
    # escaped with sed so embedded quotes do not break the JSON
    echo "{"
    echo "  \"file\": \"$LOG_FILE\","
    echo "  \"matches\": $MATCH_COUNT,"
    echo "  \"entries\": ["
    echo "$FILTERED_OUTPUT" | sed 's/\\/\\\\/g; s/"/\\"/g' | awk '{printf "    \"%s\",\n", $0}' | sed '$ s/,$//'
    echo "  ]"
    echo "}"
else
    echo "Matching Log Entries:"
    echo "─────────────────────────────────────────────────────────"
    echo "$FILTERED_OUTPUT"
fi

echo ""
log_success "Analysis complete"
exit 0
commands/debug/.scripts/memory-check.sh (new executable file, 418 lines)
@@ -0,0 +1,418 @@
#!/bin/bash
# Purpose: Monitor memory usage and detect leaks
# Version: 1.0.0
# Usage: ./memory-check.sh --app <app-name> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: ps, awk, bc

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
APP_NAME=""
DURATION=300
INTERVAL=10
THRESHOLD=1024
OUTPUT_DIR="./memory-check-output"
ALERT_ON_GROWTH=true

# Help message
show_help() {
    cat << EOF
Memory Monitoring Utility

Usage: $0 --app <app-name> [options]

Options:
    --app NAME        Application/process name to monitor (required)
    --duration N      Monitoring duration in seconds (default: 300)
    --interval N      Sampling interval in seconds (default: 10)
    --threshold MB    Alert if memory exceeds threshold in MB (default: 1024)
    --output DIR      Output directory (default: ./memory-check-output)
    --no-alert        Disable growth alerts
    -h, --help        Show this help message

Examples:
    # Monitor Node.js app for 5 minutes
    $0 --app node --duration 300

    # Monitor with custom threshold
    $0 --app node --duration 600 --threshold 2048

    # Quick check (1 minute)
    $0 --app node --duration 60 --interval 5

EOF
    exit 0
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --app)
            APP_NAME="$2"
            shift 2
            ;;
        --duration)
            DURATION="$2"
            shift 2
            ;;
        --interval)
            INTERVAL="$2"
            shift 2
            ;;
        --threshold)
            THRESHOLD="$2"
            shift 2
            ;;
        --output)
            OUTPUT_DIR="$2"
            shift 2
            ;;
        --no-alert)
            ALERT_ON_GROWTH=false
            shift
            ;;
        -h|--help)
            show_help
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}" >&2
            exit 2
            ;;
    esac
done

# Validate required parameters
if [ -z "$APP_NAME" ]; then
    echo -e "${RED}Error: --app is required${NC}" >&2
    echo "Use --help for usage information"
    exit 2
fi

# Functions
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

alert() {
    echo -e "${RED}[ALERT]${NC} $1"
}

# Create output directory
mkdir -p "$OUTPUT_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

log_info "Starting memory monitoring for: $APP_NAME"
log_info "Duration: ${DURATION}s, Interval: ${INTERVAL}s, Threshold: ${THRESHOLD}MB"
log_info "Output directory: $OUTPUT_DIR"

# Find process ID
PIDS=$(pgrep -f "$APP_NAME" || echo "")
if [ -z "$PIDS" ]; then
    log_error "No process found matching: $APP_NAME"
    exit 1
fi

PID=$(echo "$PIDS" | head -1)
log_info "Found process: PID $PID"

# Output files
MEMORY_LOG="$OUTPUT_DIR/memory-log-$TIMESTAMP.txt"
CHART_FILE="$OUTPUT_DIR/memory-chart-$TIMESTAMP.txt"
REPORT_FILE="$OUTPUT_DIR/memory-report-$TIMESTAMP.txt"

# Write header
echo "Timestamp,RSS_KB,VSZ_KB,Percent_MEM" > "$MEMORY_LOG"

log_info "Monitoring memory usage..."

# Track min/max
MIN_RSS=0
MAX_RSS=0
READINGS=()

# Collect memory samples
SAMPLES=$((DURATION / INTERVAL))
for i in $(seq 1 $SAMPLES); do
    # Get memory stats
    MEM_STATS=$(ps -p "$PID" -o rss=,vsz=,%mem= 2>/dev/null || echo "")

    if [ -z "$MEM_STATS" ]; then
        log_error "Process $PID not found. It may have terminated."
        break
    fi

    # Parse values
    RSS=$(echo "$MEM_STATS" | awk '{print $1}')
    VSZ=$(echo "$MEM_STATS" | awk '{print $2}')
    PMEM=$(echo "$MEM_STATS" | awk '{print $3}')
    TIMESTAMP_NOW=$(date '+%Y-%m-%d %H:%M:%S')

    # Update min/max
    if [ "$MIN_RSS" -eq 0 ] || [ "$RSS" -lt "$MIN_RSS" ]; then
        MIN_RSS=$RSS
    fi
    if [ "$RSS" -gt "$MAX_RSS" ]; then
        MAX_RSS=$RSS
    fi

    # Store reading
    READINGS+=("$RSS")

    # Log to file
    echo "$TIMESTAMP_NOW,$RSS,$VSZ,$PMEM" >> "$MEMORY_LOG"

    # Convert to MB for display
    RSS_MB=$(echo "scale=2; $RSS/1024" | bc)
    VSZ_MB=$(echo "scale=2; $VSZ/1024" | bc)

    # Progress display
    echo -ne "\r  Sample $i/$SAMPLES: RSS=${RSS_MB}MB, VSZ=${VSZ_MB}MB, %MEM=${PMEM}%   "

    # Check threshold
    if (( $(echo "$RSS_MB > $THRESHOLD" | bc -l) )); then
        echo "" # New line before alert
        alert "Memory threshold exceeded: ${RSS_MB}MB > ${THRESHOLD}MB"
    fi

    sleep "$INTERVAL"
done

echo "" # New line after progress

log_success "Memory monitoring complete"

# Bail out if the process died before any sample was collected
# (avoids a division by zero in the statistics below)
if [ ${#READINGS[@]} -eq 0 ]; then
    log_error "No samples collected"
    exit 1
fi

# Calculate statistics
MIN_MB=$(echo "scale=2; $MIN_RSS/1024" | bc)
MAX_MB=$(echo "scale=2; $MAX_RSS/1024" | bc)
GROWTH_MB=$(echo "scale=2; ($MAX_RSS-$MIN_RSS)/1024" | bc)

# Calculate average
TOTAL_RSS=0
for rss in "${READINGS[@]}"; do
    TOTAL_RSS=$((TOTAL_RSS + rss))
done
AVG_RSS=$((TOTAL_RSS / ${#READINGS[@]}))
AVG_MB=$(echo "scale=2; $AVG_RSS/1024" | bc)

# Detect leak (memory consistently growing)
LEAK_DETECTED=false
FIRST_HALF_AVG=0
SECOND_HALF_AVG=0
CONSISTENT_GROWTH_MB=0
if (( $(echo "$GROWTH_MB > 50" | bc -l) )); then
    # Check if growth is consistent (not just a spike) by comparing the
    # average of the first half of readings with the second half
    MID_POINT=$((${#READINGS[@]} / 2))

    for i in $(seq 0 $((MID_POINT - 1))); do
        FIRST_HALF_AVG=$((FIRST_HALF_AVG + READINGS[$i]))
    done
    FIRST_HALF_AVG=$((FIRST_HALF_AVG / MID_POINT))

    for i in $(seq $MID_POINT $((${#READINGS[@]} - 1))); do
        SECOND_HALF_AVG=$((SECOND_HALF_AVG + READINGS[$i]))
    done
    SECOND_HALF_AVG=$((SECOND_HALF_AVG / (${#READINGS[@]} - MID_POINT)))

    CONSISTENT_GROWTH=$((SECOND_HALF_AVG - FIRST_HALF_AVG))
    CONSISTENT_GROWTH_MB=$(echo "scale=2; $CONSISTENT_GROWTH/1024" | bc)

    if (( $(echo "$CONSISTENT_GROWTH_MB > 25" | bc -l) )); then
        LEAK_DETECTED=true
    fi
fi

# Generate ASCII chart
log_info "Generating memory chart..."

cat > "$CHART_FILE" << EOF
Memory Usage Over Time
═══════════════════════════════════════════════════════════

RSS (Resident Set Size) in MB

EOF

# Simple ASCII chart (20 rows, scale based on max); the whole block is
# appended to the chart file rather than printed to the terminal
CHART_HEIGHT=20
SCALE_FACTOR=$(echo "scale=2; $MAX_RSS / $CHART_HEIGHT" | bc)

{
    for row in $(seq $CHART_HEIGHT -1 0); do
        THRESHOLD_LINE=$(echo "scale=0; $row * $SCALE_FACTOR / 1024" | bc)
        printf "%4d MB |" "$THRESHOLD_LINE"

        for reading in "${READINGS[@]}"; do
            READING_ROW=$(echo "scale=0; $reading / $SCALE_FACTOR" | bc)

            if [ "$READING_ROW" -ge "$row" ]; then
                printf "█"
            else
                printf " "
            fi
        done

        echo ""
    done

    printf "        +"
    for i in $(seq 1 ${#READINGS[@]}); do printf "─"; done
    echo ""

    printf "         "
    for i in $(seq 1 ${#READINGS[@]}); do
        if [ $((i % 10)) -eq 0 ]; then
            printf "|"
        else
            printf " "
        fi
    done
    echo ""
} >> "$CHART_FILE"

cat >> "$CHART_FILE" << EOF

Legend: Each column = ${INTERVAL}s interval
Total duration: ${DURATION}s
EOF

cat "$CHART_FILE"

# Generate report
log_info "Generating memory report..."

cat > "$REPORT_FILE" << EOF
═══════════════════════════════════════════════════════════
                MEMORY MONITORING REPORT
═══════════════════════════════════════════════════════════

Application:  $APP_NAME
PID:          $PID
Duration:     ${DURATION}s (${SAMPLES} samples)
Interval:     ${INTERVAL}s
Timestamp:    $TIMESTAMP

Memory Statistics:
─────────────────────────────────────────────────────────
  Minimum RSS:    ${MIN_MB} MB
  Maximum RSS:    ${MAX_MB} MB
  Average RSS:    ${AVG_MB} MB
  Memory Growth:  ${GROWTH_MB} MB
  Threshold:      ${THRESHOLD} MB

EOF

# Leak analysis
if [ "$LEAK_DETECTED" = true ]; then
    cat >> "$REPORT_FILE" << EOF
⚠ MEMORY LEAK DETECTED
─────────────────────────────────────────────────────────
Memory grew consistently by ${CONSISTENT_GROWTH_MB} MB
First half average:  $(echo "scale=2; $FIRST_HALF_AVG/1024" | bc) MB
Second half average: $(echo "scale=2; $SECOND_HALF_AVG/1024" | bc) MB

Recommendations:
1. Take heap snapshots for detailed analysis
2. Check for:
   - Event listeners not removed
   - Timers not cleared (setInterval, setTimeout)
   - Unbounded caches or arrays
   - Circular references
   - Closures holding large objects
3. Use memory profiling tools:
   - Node.js: node --inspect, heap snapshots
   - Python: memory_profiler, tracemalloc
4. Consider using the /debug memory operation for deeper analysis

EOF

    if [ "$ALERT_ON_GROWTH" = true ]; then
        alert "MEMORY LEAK DETECTED! Growth: ${CONSISTENT_GROWTH_MB} MB"
    fi
else
    cat >> "$REPORT_FILE" << EOF
✓ NO MEMORY LEAK DETECTED
─────────────────────────────────────────────────────────
Memory usage is stable
Growth of ${GROWTH_MB} MB is within acceptable range

EOF
    log_success "No memory leak detected"
fi

# Threshold warnings
if (( $(echo "$MAX_MB > $THRESHOLD" | bc -l) )); then
    cat >> "$REPORT_FILE" << EOF
⚠ THRESHOLD EXCEEDED
─────────────────────────────────────────────────────────
Peak memory (${MAX_MB} MB) exceeded threshold (${THRESHOLD} MB)

Recommendations:
1. Increase memory allocation if necessary
2. Optimize memory usage:
   - Use streaming for large data
   - Implement pagination
   - Use efficient data structures
   - Clear unused objects
3. Set appropriate container/VM memory limits

EOF
fi

# Output files
cat >> "$REPORT_FILE" << EOF
Output Files:
─────────────────────────────────────────────────────────
  Memory Log:    $MEMORY_LOG
  Memory Chart:  $CHART_FILE
  This Report:   $REPORT_FILE

Next Steps:
─────────────────────────────────────────────────────────
EOF

if [ "$LEAK_DETECTED" = true ]; then
    cat >> "$REPORT_FILE" << EOF
1. Use /debug memory for heap profiling
2. Take heap snapshots before and after operations
3. Review code for common leak patterns
4. Monitor production with these findings
EOF
else
    cat >> "$REPORT_FILE" << EOF
1. Continue monitoring in production
2. Set up alerts for memory threshold
3. Schedule periodic memory checks
EOF
fi

echo "" >> "$REPORT_FILE"
echo "═══════════════════════════════════════════════════════════" >> "$REPORT_FILE"

log_success "Report saved to: $REPORT_FILE"

# Display report
cat "$REPORT_FILE"

# Exit with appropriate code
if [ "$LEAK_DETECTED" = true ]; then
    exit 1
else
    exit 0
fi
commands/debug/.scripts/profile.sh (new executable file, 297 lines)
@@ -0,0 +1,297 @@
#!/bin/bash
# Purpose: Profile application performance (CPU, memory, I/O)
# Version: 1.0.0
# Usage: ./profile.sh --app <app-name> [options]
# Returns: 0=success, 1=error, 2=invalid params
# Dependencies: ps, top, pidstat (optional)

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
APP_NAME=""
DURATION=60
INTERVAL=1
OUTPUT_DIR="./profile-output"
PROFILE_TYPE="all"
ENDPOINT=""

# Help message
show_help() {
    cat << EOF
Application Profiling Utility

Usage: $0 --app <app-name> [options]

Options:
    --app NAME        Application/process name to profile (required)
    --duration N      Profile duration in seconds (default: 60)
    --interval N      Sampling interval in seconds (default: 1)
    --type TYPE       Profile type: cpu|memory|io|all (default: all)
    --endpoint URL    Optional: HTTP endpoint to load test during profiling
    --output DIR      Output directory (default: ./profile-output)
    -h, --help        Show this help message

Examples:
    # Profile Node.js app for 2 minutes
    $0 --app node --duration 120

    # Profile with load test
    $0 --app node --duration 60 --endpoint http://localhost:3000/api/test

    # Profile only CPU
    $0 --app node --duration 30 --type cpu

EOF
    exit 0
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --app)
            APP_NAME="$2"
            shift 2
            ;;
        --duration)
            DURATION="$2"
            shift 2
            ;;
        --interval)
            INTERVAL="$2"
            shift 2
            ;;
        --type)
            PROFILE_TYPE="$2"
            shift 2
            ;;
        --endpoint)
            ENDPOINT="$2"
            shift 2
            ;;
        --output)
            OUTPUT_DIR="$2"
            shift 2
            ;;
        -h|--help)
            show_help
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}" >&2
            exit 2
            ;;
    esac
done

# Validate required parameters
if [ -z "$APP_NAME" ]; then
    echo -e "${RED}Error: --app is required${NC}" >&2
    echo "Use --help for usage information"
    exit 2
fi

# Functions
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Create output directory
mkdir -p "$OUTPUT_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

log_info "Starting profiling for: $APP_NAME"
log_info "Duration: ${DURATION}s, Interval: ${INTERVAL}s"
log_info "Output directory: $OUTPUT_DIR"

# Find process ID
PIDS=$(pgrep -f "$APP_NAME" || echo "")
if [ -z "$PIDS" ]; then
    log_error "No process found matching: $APP_NAME"
    exit 1
fi

PID=$(echo "$PIDS" | head -1)
log_info "Found process: PID $PID"

# Number of samples so that SAMPLES x INTERVAL ≈ DURATION
SAMPLES=$((DURATION / INTERVAL))

# Start load test if endpoint provided
LOAD_TEST_PID=""
if [ -n "$ENDPOINT" ]; then
    log_info "Starting load test on: $ENDPOINT"

    if command -v ab &> /dev/null; then
        # Use Apache Bench
        ab -n 100000 -c 10 "$ENDPOINT" > "$OUTPUT_DIR/load-test-$TIMESTAMP.log" 2>&1 &
        LOAD_TEST_PID=$!
        log_info "Load test started (PID: $LOAD_TEST_PID)"
    else
        log_warn "Apache Bench (ab) not found, skipping load test"
    fi
fi

# CPU Profiling
if [ "$PROFILE_TYPE" = "cpu" ] || [ "$PROFILE_TYPE" = "all" ]; then
    log_info "Profiling CPU usage..."

    CPU_OUTPUT="$OUTPUT_DIR/cpu-profile-$TIMESTAMP.txt"

    # Collect CPU samples (headerless ps output, one line per sample)
    for i in $(seq 1 $SAMPLES); do
        ps -p "$PID" -o %cpu=,rss=,vsz=,cmd= >> "$CPU_OUTPUT" 2>/dev/null || true
        sleep "$INTERVAL"
    done

    log_success "CPU profile saved to: $CPU_OUTPUT"

    # Calculate statistics
    AVG_CPU=$(awk '{sum+=$1; count++} END {if (count>0) print sum/count; else print 0}' "$CPU_OUTPUT")
    MAX_CPU=$(awk '{if ($1>max) max=$1} END {print max+0}' "$CPU_OUTPUT")

    echo "CPU Statistics:" > "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
    echo "  Average CPU: $AVG_CPU%" >> "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
    echo "  Peak CPU: $MAX_CPU%" >> "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt"
fi

# Memory Profiling
if [ "$PROFILE_TYPE" = "memory" ] || [ "$PROFILE_TYPE" = "all" ]; then
    log_info "Profiling memory usage..."

    MEM_OUTPUT="$OUTPUT_DIR/memory-profile-$TIMESTAMP.txt"

    # Collect memory samples (headerless ps output, one line per sample)
    for i in $(seq 1 $SAMPLES); do
        ps -p "$PID" -o rss=,vsz=,%mem=,cmd= >> "$MEM_OUTPUT" 2>/dev/null || true
        sleep "$INTERVAL"
    done

    log_success "Memory profile saved to: $MEM_OUTPUT"

    # Calculate statistics
    AVG_RSS=$(awk '{sum+=$1; count++} END {if (count>0) print sum/count; else print 0}' "$MEM_OUTPUT")
    MAX_RSS=$(awk '{if ($1>max) max=$1} END {print max+0}' "$MEM_OUTPUT")
    MIN_RSS=$(awk '{if (min=="") min=$1; if ($1<min) min=$1} END {print min+0}' "$MEM_OUTPUT")

    echo "Memory Statistics:" > "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
    echo "  Average RSS: $(echo "scale=2; $AVG_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
    echo "  Peak RSS: $(echo "scale=2; $MAX_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
    echo "  Min RSS: $(echo "scale=2; $MIN_RSS/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
    echo "  Memory Growth: $(echo "scale=2; ($MAX_RSS-$MIN_RSS)/1024" | bc) MB" >> "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt"
fi

# I/O Profiling
if [ "$PROFILE_TYPE" = "io" ] || [ "$PROFILE_TYPE" = "all" ]; then
    log_info "Profiling I/O usage..."

    IO_OUTPUT="$OUTPUT_DIR/io-profile-$TIMESTAMP.txt"

    # Check if process has I/O stats available (Linux /proc only)
    if [ -f "/proc/$PID/io" ]; then
        # Collect I/O samples
        for i in $(seq 1 $SAMPLES); do
            echo "=== Sample $i ===" >> "$IO_OUTPUT"
            cat "/proc/$PID/io" >> "$IO_OUTPUT" 2>/dev/null || true
            sleep "$INTERVAL"
        done

        log_success "I/O profile saved to: $IO_OUTPUT"
    else
        log_warn "I/O profiling not available for this process"
    fi
fi

# Stop load test if running
if [ -n "$LOAD_TEST_PID" ]; then
    log_info "Stopping load test..."
    kill "$LOAD_TEST_PID" 2>/dev/null || true
    wait "$LOAD_TEST_PID" 2>/dev/null || true
fi

# Generate summary report
REPORT_FILE="$OUTPUT_DIR/profile-report-$TIMESTAMP.txt"

cat > "$REPORT_FILE" << EOF
═══════════════════════════════════════════════════════════
              PERFORMANCE PROFILE REPORT
═══════════════════════════════════════════════════════════

Application:  $APP_NAME
PID:          $PID
Duration:     ${DURATION}s
Interval:     ${INTERVAL}s
Timestamp:    $TIMESTAMP

EOF

# Add CPU summary if available
if [ -f "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" ]; then
    cat "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" >> "$REPORT_FILE"
    echo "" >> "$REPORT_FILE"
fi

# Add memory summary if available
if [ -f "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" ]; then
    cat "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" >> "$REPORT_FILE"
    echo "" >> "$REPORT_FILE"
fi

# Add recommendations
cat >> "$REPORT_FILE" << EOF
Recommendations:
─────────────────────────────────────────────────────────

EOF

if [ -f "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" ]; then
    MAX_CPU=$(awk '/Peak CPU:/ {print $3}' "$OUTPUT_DIR/cpu-summary-$TIMESTAMP.txt" | sed 's/%//')
    if [ -n "$MAX_CPU" ] && (( $(echo "$MAX_CPU > 80" | bc -l) )); then
        echo "  ⚠ High CPU usage detected (${MAX_CPU}%)" >> "$REPORT_FILE"
        echo "    - Consider optimizing CPU-intensive operations" >> "$REPORT_FILE"
        echo "    - Profile with flame graphs for detailed analysis" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
    fi
fi

if [ -f "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt" ]; then
    GROWTH=$(awk '/Memory Growth:/ {print $3}' "$OUTPUT_DIR/memory-summary-$TIMESTAMP.txt")
    if [ -n "$GROWTH" ] && (( $(echo "$GROWTH > 100" | bc -l) )); then
        echo "  ⚠ Significant memory growth detected (${GROWTH} MB)" >> "$REPORT_FILE"
        echo "    - Possible memory leak" >> "$REPORT_FILE"
        echo "    - Use heap profiling to identify leak sources" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
    fi
fi

cat >> "$REPORT_FILE" << EOF
Output Files:
─────────────────────────────────────────────────────────
EOF

ls -lh "$OUTPUT_DIR"/*-"$TIMESTAMP".* >> "$REPORT_FILE"

echo "" >> "$REPORT_FILE"
echo "═══════════════════════════════════════════════════════════" >> "$REPORT_FILE"

log_success "Profile complete!"
log_info "Report saved to: $REPORT_FILE"

# Display summary
cat "$REPORT_FILE"

exit 0
commands/debug/README.md (new file, 596 lines)
@@ -0,0 +1,596 @@
# Debug Skill - Comprehensive Debugging Toolkit

A professional-grade debugging skill for diagnosing, reproducing, fixing, analyzing, and optimizing complex issues across the entire application stack.

## Overview

The debug skill provides systematic debugging operations that work seamlessly with the **10x-fullstack-engineer** agent to deliver cross-stack debugging expertise, production-grade strategies, and prevention-focused solutions.

## Available Operations

### 1. **diagnose** - Comprehensive Diagnosis and Root Cause Analysis

Performs systematic diagnosis across all layers of the application stack to identify root causes of complex issues.

**Usage:**
```bash
/10x-fullstack-engineer:debug diagnose issue:"Users getting 500 errors on file upload" environment:"production" logs:"logs/app.log"
```

**Parameters:**
- `issue:"description"` (required) - Problem description
- `environment:"prod|staging|dev"` (optional) - Target environment
- `logs:"path"` (optional) - Log file location
- `reproduction:"steps"` (optional) - Steps to reproduce
- `impact:"severity"` (optional) - Issue severity

**What it does:**
- Collects diagnostic data from frontend, backend, database, and infrastructure
- Analyzes symptoms and patterns across all stack layers
- Forms and tests hypotheses systematically
- Identifies root cause with supporting evidence
- Provides actionable recommendations

**Output:**
- Executive summary of issue and root cause
- Detailed diagnostic data from each layer
- Hypothesis analysis with evidence
- Root cause explanation
- Recommended immediate actions and permanent fix
- Prevention measures (monitoring, testing, documentation)

---

### 2. **reproduce** - Create Reliable Reproduction Strategies

Develops reliable strategies to reproduce issues consistently, creating test cases and reproduction documentation.

**Usage:**
```bash
/10x-fullstack-engineer:debug reproduce issue:"Payment webhook fails intermittently" environment:"staging" data:"sample-webhook-payload.json"
```

**Parameters:**
- `issue:"description"` (required) - Issue to reproduce
- `environment:"prod|staging|dev"` (optional) - Environment context
- `data:"path"` (optional) - Test data location
- `steps:"description"` (optional) - Known reproduction steps
- `reliability:"percentage"` (optional) - Current reproduction rate

**What it does:**
- Gathers environment, data, and user context
- Creates local reproduction strategy
- Develops automated test cases (unit, integration, E2E)
- Tests scenario variations and edge cases
- Verifies reproduction reliability
- Documents a comprehensive reproduction guide

**Output:**
- Reproduction reliability metrics
- Prerequisites and setup instructions
- Detailed reproduction steps (manual and automated)
- Automated test case code
- Scenario variations tested
- Troubleshooting guide for reproduction issues

---

### 3. **fix** - Implement Targeted Fixes with Verification

Implements targeted fixes with comprehensive verification, safeguards, and prevention measures.

**Usage:**
```bash
/10x-fullstack-engineer:debug fix issue:"Race condition in order processing" root_cause:"Missing transaction lock" verification:"run-integration-tests"
```

**Parameters:**
- `issue:"description"` (required) - Issue being fixed
- `root_cause:"cause"` (required) - Identified root cause
- `verification:"strategy"` (optional) - Verification approach
- `scope:"areas"` (optional) - Affected code areas
- `rollback:"plan"` (optional) - Rollback strategy

**What it does:**
- Designs an appropriate fix pattern for the issue type
- Implements the fix with safety measures
- Adds safeguards (validation, rate limiting, circuit breakers)
- Performs multi-level verification (unit, integration, load, production)
- Adds prevention measures (tests, monitoring, alerts)
- Documents the fix and deployment plan

**Fix patterns supported** (see the locking sketch below):
- Missing error handling
- Race conditions
- Memory leaks
- Missing validation
- N+1 query problems
- Configuration issues
- Infrastructure limits
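
For instance, at the infrastructure level, a race between concurrent job runs can be serialized with `flock`. A minimal sketch; the lock path and job script are illustrative:

```bash
# Only one instance may hold the lock; a second run exits instead of racing
flock -n /tmp/order-processor.lock ./process-orders.sh \
  || echo "another run is in progress, skipping"
```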

**Output:**
- Detailed fix implementation with before/after code
- Safeguards added (validation, error handling, monitoring)
- Verification results at all levels
- Prevention measures (tests, alerts, documentation)
- Deployment plan with rollback strategy
- Files modified and commits made

---

### 4. **analyze-logs** - Deep Log Analysis with Pattern Detection

Performs deep log analysis with pattern detection, timeline correlation, and anomaly identification.

**Usage:**
```bash
/10x-fullstack-engineer:debug analyze-logs path:"logs/application.log" pattern:"ERROR.*timeout" timeframe:"last-24h"
```

**Parameters:**
- `path:"log-file-path"` (required) - Log file to analyze
- `pattern:"regex"` (optional) - Filter pattern
- `timeframe:"range"` (optional) - Time range to analyze
- `level:"error|warn|info"` (optional) - Log level filter
- `context:"lines"` (optional) - Context lines around matches

**What it does:**
- Discovers and filters relevant logs across all sources
- Detects error patterns and clusters similar errors
- Performs timeline analysis and event correlation
- Traces individual requests across services
- Identifies statistical anomalies and spikes
- Analyzes performance, user impact, and security issues

**Utility script:**
```bash
./commands/debug/.scripts/analyze-logs.sh \
  --file logs/application.log \
  --level ERROR \
  --since "1 hour ago" \
  --context 5
```
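
With `--format json`, the script's output can be piped into `jq` for further processing. A minimal sketch, assuming `jq` is installed (the log path is illustrative):

```bash
# Report the match count and the first three matching entries
./commands/debug/.scripts/analyze-logs.sh \
  --file logs/application.log --level ERROR --format json \
  | jq '{matches: .matches, sample: .entries[:3]}'
```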

**Output:**
- Summary of findings with key statistics
- Top errors with frequency and patterns
- Timeline of critical events
- Request tracing through distributed system
- Anomaly detection (spikes, new errors)
- Performance analysis from logs
- User impact assessment
- Root cause analysis based on log patterns
- Recommendations for fixes and monitoring

---

### 5. **performance** - Performance Debugging and Optimization

Debugs performance issues through profiling, bottleneck identification, and targeted optimization.

**Usage:**
```bash
/10x-fullstack-engineer:debug performance component:"api-endpoint:/orders" metric:"response-time" threshold:"200ms"
```

**Parameters:**
- `component:"name"` (required) - Component to profile
- `metric:"type"` (optional) - Metric to measure (response-time, throughput, cpu, memory)
- `threshold:"value"` (optional) - Target performance threshold
- `duration:"period"` (optional) - Profiling duration
- `load:"users"` (optional) - Concurrent users for load testing

**What it does:**
- Establishes performance baseline
- Profiles application, database, and network
- Identifies bottlenecks (CPU, I/O, memory, network)
- Implements targeted optimizations (queries, caching, algorithms, async)
- Performs load testing to verify improvements
- Sets up performance monitoring

**Profiling utility script:**
```bash
./commands/debug/.scripts/profile.sh \
  --app node_app \
  --duration 60 \
  --endpoint http://localhost:3000/api/slow
```

**Optimization strategies:**
- Query optimization (indexes, query rewriting)
- Caching (application-level, Redis)
- Code optimization (algorithms, lazy loading, pagination)
- Async optimization (parallel execution, batching)

**Output:**
- Performance baseline and after-optimization metrics
- Bottlenecks identified with evidence
- Optimizations implemented with code changes
- Load testing results
- Performance improvement percentages
- Monitoring setup (metrics, dashboards, alerts)
- Recommendations for additional optimizations

---

### 6. **memory** - Memory Leak Detection and Optimization

Detects memory leaks, analyzes memory usage patterns, and optimizes memory consumption.

**Usage:**
```bash
/10x-fullstack-engineer:debug memory component:"background-worker" symptom:"growing-heap" duration:"6h"
```

**Parameters:**
- `component:"name"` (required) - Component to analyze
- `symptom:"type"` (optional) - Memory symptom (growing-heap, high-usage, oom)
- `duration:"period"` (optional) - Observation period
- `threshold:"max-mb"` (optional) - Memory threshold in MB
- `profile:"type"` (optional) - Profile type (heap, allocation)

**What it does:**
- Identifies memory symptoms (leaks, high usage, OOM)
- Captures memory profiles (heap snapshots, allocation tracking)
- Analyzes common leak patterns
- Implements memory optimizations
- Performs leak verification under load
- Tunes garbage collection

**Memory check utility script:**
```bash
./commands/debug/.scripts/memory-check.sh \
  --app node_app \
  --duration 300 \
  --interval 10 \
  --threshold 1024
```
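
Because the script exits 0 when memory is stable and 1 when a leak is detected, it can gate a CI or cron job directly. A minimal sketch:

```bash
# Fail the job (and keep the generated report) when a leak is suspected
if ./commands/debug/.scripts/memory-check.sh --app node_app --duration 300; then
  echo "memory stable"
else
  echo "possible leak; see ./memory-check-output/ for the report"
  exit 1
fi
```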

**Common leak patterns detected:**
- Event listeners not removed
- Timers not cleared
- Closures holding references
- Unbounded caches
- Global variable accumulation
- Detached DOM nodes
- Infinite promise chains

**Optimization techniques:**
- Stream large data instead of loading into memory
- Use efficient data structures (Map vs Array)
- Paginate database queries
- Implement LRU caches with size limits
- Use weak references where appropriate
- Object pooling for frequently created objects

**Output:**
- Memory symptoms and baseline metrics
- Heap snapshot analysis
- Memory leaks identified with evidence
- Fixes implemented with before/after code
- Memory after fixes with improvement percentages
- Memory stability test results
- Garbage collection metrics
- Monitoring setup and alerts
- Recommendations for memory limits and future monitoring

---

## Utility Scripts

The debug skill includes three utility scripts in the `.scripts/` directory:

### analyze-logs.sh
**Purpose:** Analyze log files for patterns, errors, and anomalies

**Features:**
- Pattern matching with regex
- Log level filtering
- Time-based filtering
- Context lines around matches
- Error statistics and top errors
- Time distribution analysis
- JSON output support

### profile.sh
**Purpose:** Profile application performance (CPU, memory, I/O)

**Features:**
- CPU profiling with statistics
- Memory profiling with growth detection
- I/O profiling
- Concurrent load testing
- Automated recommendations
- Comprehensive reports

### memory-check.sh
**Purpose:** Monitor memory usage and detect leaks

**Features:**
- Real-time memory monitoring
- Memory growth detection
- Leak detection with trend analysis
- ASCII memory usage charts
- Threshold alerts
- Detailed memory reports

---

## Common Debugging Workflows

### Workflow 1: Production Error Investigation

```bash
# Step 1: Diagnose the issue
/10x-fullstack-engineer:debug diagnose issue:"500 errors on checkout" environment:"production" logs:"logs/app.log"

# Step 2: Analyze logs for patterns
/10x-fullstack-engineer:debug analyze-logs path:"logs/app.log" pattern:"checkout.*ERROR" timeframe:"last-1h"

# Step 3: Reproduce locally
/10x-fullstack-engineer:debug reproduce issue:"Checkout fails with 500" environment:"staging" data:"test-checkout.json"

# Step 4: Implement fix
/10x-fullstack-engineer:debug fix issue:"Database timeout on checkout" root_cause:"Missing connection pool configuration"
```

### Workflow 2: Performance Degradation

```bash
# Step 1: Profile performance
/10x-fullstack-engineer:debug performance component:"api-endpoint:/checkout" metric:"response-time" threshold:"500ms"

# Step 2: Analyze slow queries
/10x-fullstack-engineer:debug analyze-logs path:"logs/postgresql.log" pattern:"duration:.*[0-9]{4,}"

# Step 3: Implement optimization
/10x-fullstack-engineer:debug fix issue:"Slow checkout API" root_cause:"N+1 query on order items"
```

### Workflow 3: Memory Leak Investigation

```bash
# Step 1: Diagnose memory symptoms
/10x-fullstack-engineer:debug diagnose issue:"Memory grows over time" environment:"production"

# Step 2: Profile memory usage
/10x-fullstack-engineer:debug memory component:"background-processor" symptom:"growing-heap" duration:"1h"

# Step 3: Implement fix
/10x-fullstack-engineer:debug fix issue:"Memory leak in event handlers" root_cause:"Event listeners not removed"
```

### Workflow 4: Intermittent Failure

```bash
# Step 1: Reproduce reliably
/10x-fullstack-engineer:debug reproduce issue:"Random payment failures" environment:"staging"

# Step 2: Diagnose with reproduction
/10x-fullstack-engineer:debug diagnose issue:"Payment webhook fails intermittently" reproduction:"steps-from-reproduce"

# Step 3: Analyze timing
/10x-fullstack-engineer:debug analyze-logs path:"logs/webhooks.log" pattern:"payment.*fail" context:10

# Step 4: Fix race condition
/10x-fullstack-engineer:debug fix issue:"Race condition in webhook handler" root_cause:"Concurrent webhook processing"
```

---

## Integration with 10x-fullstack-engineer Agent

All debugging operations are designed to work with the **10x-fullstack-engineer** agent, which provides:

- **Cross-stack debugging expertise** - Systematic analysis across frontend, backend, database, and infrastructure
- **Systematic root cause analysis** - Hypothesis formation, testing, and evidence-based conclusions
- **Production-grade debugging strategies** - Safe, reliable approaches suitable for production environments
- **Performance and security awareness** - Considers performance impact and security implications
- **Prevention-focused mindset** - Not just fixing issues, but preventing future occurrences

The agent brings deep expertise in:
- Full-stack architecture patterns
- Performance optimization techniques
- Memory management and leak detection
- Database query optimization
- Distributed systems debugging
- Production safety and deployment strategies

---

## Debugging Best Practices

### 1. Start with Diagnosis
Always begin with `/debug diagnose` to understand the full scope of the issue before attempting fixes.

### 2. Reproduce Reliably
Use `/debug reproduce` to create reproducible test cases. A bug that can't be reliably reproduced is hard to fix and verify.

### 3. Analyze Logs Systematically
Use `/debug analyze-logs` to find patterns and correlations. Look for:
- Error frequency and distribution
- Timeline correlation with deployments
- Anomalies and spikes
- Request tracing across services

### 4. Profile Before Optimizing
Use `/debug performance` and `/debug memory` to identify actual bottlenecks. Don't optimize based on assumptions.

### 5. Fix with Verification
Use `/debug fix`, which includes:
- Proper error handling
- Comprehensive testing
- Monitoring and alerts
- Documentation

### 6. Add Prevention Measures
Every fix should include:
- Regression tests
- Monitoring metrics
- Alerts on thresholds
- Documentation updates

---

## Output Documentation

Each operation generates comprehensive reports in markdown format:

- **Executive summaries** for stakeholders
- **Detailed technical analysis** for engineers
- **Code snippets** with before/after comparisons
- **Evidence and metrics** supporting conclusions
- **Actionable recommendations** with priorities
- **Next steps** with clear instructions

Reports include:
- Issue description and symptoms
- Analysis methodology and findings
- Root cause explanation with evidence
- Fixes implemented with code
- Verification results
- Prevention measures added
- Files modified and commits
- Monitoring and alerting setup

---

## Error Handling

All operations include robust error handling:

- **Insufficient information** - Lists what's needed and how to gather it
- **Cannot reproduce** - Suggests alternative debugging approaches
- **Fix verification fails** - Provides re-diagnosis steps
- **Optimization degrades performance** - Includes rollback procedures
- **Environment differences** - Helps bridge local vs production gaps

---

## Common Debugging Scenarios

### Database Performance Issues
1. Use `/debug performance` to establish a baseline
2. Use `/debug analyze-logs` on database slow query logs
3. Identify missing indexes or inefficient queries
4. Use `/debug fix` to implement the optimization
5. Verify with load testing (see the example below)
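
Load-testing verification can be as simple as an identical Apache Bench run before and after the change, comparing the reported mean and percentile latencies. The URL and request counts below are illustrative:

```bash
# 1000 requests, 10 concurrent; run once on the old build and once on the fix
ab -n 1000 -c 10 http://localhost:3000/api/orders
```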

### Memory Leaks
1. Use `/debug diagnose` to identify symptoms
2. Use `/debug memory` to capture heap profiles
3. Identify leak patterns (event listeners, timers, caches)
4. Use `/debug fix` to implement cleanup
5. Verify with sustained load testing

### Intermittent Errors
1. Use `/debug analyze-logs` to find error patterns
2. Use `/debug reproduce` to create a reliable reproduction
3. Use `/debug diagnose` with reproduction steps
4. Identify timing or concurrency issues
5. Use `/debug fix` to implement proper synchronization

### Production Incidents
1. Use `/debug diagnose` for rapid root cause analysis
2. Use `/debug analyze-logs` for the recent time period
3. Implement immediate mitigation (rollback, circuit breaker)
4. Use `/debug reproduce` to prevent recurrence
5. Use `/debug fix` for a permanent solution

### Performance Degradation
1. Use `/debug performance` to compare against the baseline
2. Identify bottlenecks (CPU, I/O, memory, network)
3. Use `/debug analyze-logs` for slow operations
4. Implement targeted optimizations
5. Verify improvements with load testing

---

## Tips and Tricks

### Effective Log Analysis
- Use pattern matching to find related errors
- Look for request IDs to trace across services (see the example below)
- Check timestamps for correlation with deployments
- Compare error rates before and after changes
- Use context lines to understand error conditions
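
For example, a single request can often be reconstructed by grepping every service log for its request ID and sorting by timestamp (the ID below is hypothetical):

```bash
# Merge this request's entries from all service logs, ordered by timestamp
grep -h "req-4f2a91" logs/*.log | sort -k1,2
```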

### Performance Profiling
- Profile production-like workloads
- Use realistic data sizes
- Test under sustained load, not just peak
- Profile both CPU and memory together
- Use flame graphs for visual analysis

### Memory Debugging
- Force GC between measurements for accuracy (see the Node.js example below)
- Take multiple heap snapshots over time
- Look for objects that never get collected
- Check for consistent growth, not just spikes
- Verify fixes with extended monitoring
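
In Node.js, for example, GC can be forced between measurements when the process is started with `--expose-gc` (the entry point is a placeholder):

```bash
# Makes global.gc() available inside the app; call it before each heap measurement
node --expose-gc app.js
```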

### Reproduction Strategies
- Minimize the reproduction to essential steps
- Control timing with explicit delays
- Use specific test data that triggers the issue
- Document environment differences
- Aim for >80% reproduction reliability (measured as sketched below)
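
A quick way to measure that reliability is to run the reproduction in a loop and count failures; the script name is a placeholder:

```bash
# Run the reproduction 20 times and report how often the bug appears
fails=0
for i in $(seq 1 20); do
  ./reproduce-bug.sh || fails=$((fails + 1))
done
echo "reproduced $fails/20 times"
```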

---

## File Locations

```
plugins/10x-fullstack-engineer/commands/debug/
├── skill.md              # Router/orchestrator
├── diagnose.md           # Diagnosis operation
├── reproduce.md          # Reproduction operation
├── fix.md                # Fix implementation operation
├── analyze-logs.md       # Log analysis operation
├── performance.md        # Performance debugging operation
├── memory.md             # Memory debugging operation
├── .scripts/
│   ├── analyze-logs.sh   # Log analysis utility
│   ├── profile.sh        # Performance profiling utility
│   └── memory-check.sh   # Memory monitoring utility
└── README.md             # This file
```

---

## Requirements

- **Node.js operations**: Node.js runtime with `--inspect` or `--prof` flags for profiling
- **Log analysis**: Standard Unix tools (awk, grep, sed), optional jq for JSON logs
- **Performance profiling**: Apache Bench (ab), k6, or Artillery for load testing
- **Memory profiling**: Chrome DevTools, clinic.js, or memwatch for Node.js
- **Database profiling**: Access to database query logs and EXPLAIN ANALYZE capability
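
For PostgreSQL, for example, a plan with actual timings can be captured from the shell; the connection string and query are placeholders:

```bash
# EXPLAIN ANALYZE executes the query and reports real row counts and timings
psql "$DATABASE_URL" -c "EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 42;"
```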

---

## Support and Troubleshooting

If operations fail:
1. Check that required parameters are provided
2. Verify file paths and permissions
3. Ensure utility scripts are executable (`chmod +x .scripts/*.sh`)
4. Check that prerequisite tools are installed
5. Review error messages for specific issues

For complex debugging scenarios:
- Start with `/debug diagnose` for systematic analysis
- Use multiple operations in sequence for comprehensive investigation
- Leverage the 10x-fullstack-engineer agent's expertise
- Document findings and share with team

---

## Version

Debug Skill v1.0.0

---

## License

Part of the 10x-fullstack-engineer plugin for Claude Code.
commands/debug/analyze-logs.md (new file, 842 lines)
@@ -0,0 +1,842 @@
|
||||
# Analyze Logs Operation - Deep Log Analysis
|
||||
|
||||
You are executing the **analyze-logs** operation to perform deep log analysis with pattern detection, timeline correlation, and anomaly identification.
|
||||
|
||||
## Parameters
|
||||
|
||||
**Received**: `$ARGUMENTS` (after removing 'analyze-logs' operation name)
|
||||
|
||||
Expected format: `path:"log-file-path" [pattern:"regex-pattern"] [timeframe:"time-range"] [level:"error|warn|info"] [context:"lines-before-after"]`
|
||||
|
||||
## Workflow
|
||||
|
||||
### 1. Discover and Locate Logs
|
||||
|
||||
Identify all relevant log sources:
|
||||
|
||||
**Application Logs**:
|
||||
```bash
|
||||
# Common log locations
|
||||
ls -lh /var/log/application/
|
||||
ls -lh logs/
|
||||
ls -lh ~/.pm2/logs/
|
||||
|
||||
# Find log files
|
||||
find /var/log -name "*.log" -type f
|
||||
find . -name "*.log" -mtime -1 # Modified in last 24 hours
|
||||
|
||||
# Check log rotation
|
||||
ls -lh /var/log/application/*.log*
|
||||
zcat /var/log/application/app.log.*.gz # Read rotated logs
|
||||
```
|
||||
|
||||
**System Logs**:
|
||||
```bash
|
||||
# Systemd service logs
|
||||
journalctl -u application.service --since "1 hour ago"
|
||||
journalctl -u application.service --since "2024-10-14 14:00:00"
|
||||
|
||||
# Syslog
|
||||
tail -f /var/log/syslog
|
||||
tail -f /var/log/messages
|
||||
|
||||
# Kernel logs
|
||||
dmesg -T
|
||||
```
|
||||
|
||||
**Container Logs**:
|
||||
```bash
|
||||
# Docker
|
||||
docker logs container-name --since 1h
|
||||
docker logs container-name --timestamps
|
||||
docker logs --tail 1000 container-name > container-logs.txt
|
||||
|
||||
# Kubernetes
|
||||
kubectl logs pod-name -c container-name
|
||||
kubectl logs pod-name --previous # Previous container
|
||||
kubectl logs -l app=myapp --all-containers=true
|
||||
```
|
||||
|
||||
**Web Server Logs**:
|
||||
```bash
|
||||
# Nginx
|
||||
tail -f /var/log/nginx/access.log
|
||||
tail -f /var/log/nginx/error.log
|
||||
|
||||
# Apache
|
||||
tail -f /var/log/apache2/access.log
|
||||
tail -f /var/log/apache2/error.log
|
||||
```
|
||||
|
||||
**Database Logs**:
|
||||
```bash
|
||||
# PostgreSQL
|
||||
tail -f /var/log/postgresql/postgresql-*.log
|
||||
|
||||
# MySQL
|
||||
tail -f /var/log/mysql/error.log
|
||||
tail -f /var/log/mysql/slow-query.log
|
||||
|
||||
# MongoDB
|
||||
tail -f /var/log/mongodb/mongod.log
|
||||
```
|
||||
|
||||
### 2. Filter and Extract Relevant Logs
|
||||
|
||||
Use the `.scripts/analyze-logs.sh` utility to extract relevant log entries:
|
||||
|
||||
**Basic Extraction**:
|
||||
```bash
|
||||
# Extract errors from last hour
|
||||
./commands/debug/.scripts/analyze-logs.sh \
|
||||
--file logs/application.log \
|
||||
--level ERROR \
|
||||
--since "1 hour ago"
|
||||
|
||||
# Extract with pattern matching
|
||||
./commands/debug/.scripts/analyze-logs.sh \
|
||||
--file logs/application.log \
|
||||
--pattern "timeout|connection.*refused" \
|
||||
--context 5
|
||||
|
||||
# Extract specific timeframe
|
||||
./commands/debug/.scripts/analyze-logs.sh \
|
||||
--file logs/application.log \
|
||||
--start "2024-10-14 14:00:00" \
|
||||
--end "2024-10-14 15:00:00"
|
||||
```
|
||||
|
||||
**Manual Filtering**:
|
||||
```bash
|
||||
# Find errors with context
|
||||
grep -i "error" logs/application.log -A 5 -B 5
|
||||
|
||||
# Find specific error patterns
|
||||
grep -E "(timeout|refused|failed)" logs/application.log
|
||||
|
||||
# Find errors in timeframe
|
||||
awk '/2024-10-14 14:/ && /ERROR/ {print}' logs/application.log
|
||||
|
||||
# Count errors by type
|
||||
grep "ERROR" logs/application.log | awk '{print $5}' | sort | uniq -c | sort -rn
|
||||
|
||||
# Extract JSON logs with jq
|
||||
cat logs/application.log | jq 'select(.level == "error")'
|
||||
cat logs/application.log | jq 'select(.message | contains("timeout"))'
|
||||
```
|
||||
|
||||
### 3. Pattern Detection

Identify patterns in log data:

#### Error Patterns

**Frequency Analysis**:
```bash
# Error frequency over time (per hour)
grep "ERROR" logs/application.log | \
  awk '{print $1, $2}' | \
  cut -d: -f1 | \
  uniq -c

# Most common errors
grep "ERROR" logs/application.log | \
  awk -F'ERROR' '{print $2}' | \
  sort | uniq -c | sort -rn | head -20

# Error rate calculation
total_lines=$(wc -l < logs/application.log)
error_lines=$(grep -c "ERROR" logs/application.log)
echo "Error rate: $(echo "scale=4; $error_lines / $total_lines * 100" | bc)%"
```

**Error Clustering**:
```python
# Group similar errors
import re
from collections import Counter

def normalize_error(error_msg):
    # Normalize UUIDs and dates before bare digits; replacing digits
    # first would mangle the UUID and date patterns
    error_msg = re.sub(r'[a-f0-9-]{36}', 'UUID', error_msg)
    error_msg = re.sub(r'\d{4}-\d{2}-\d{2}', 'DATE', error_msg)
    error_msg = re.sub(r'\d+', 'N', error_msg)
    return error_msg

errors = []
with open('logs/application.log') as f:
    for line in f:
        if 'ERROR' in line:
            normalized = normalize_error(line)
            errors.append(normalized)

# Count error types
error_counts = Counter(errors)
for error, count in error_counts.most_common(10):
    print(f"{count}: {error}")
```

#### Request Patterns

**Request Analysis**:
```bash
# Requests per minute ($4 is the timestamp in the combined log format)
awk '{print $4}' /var/log/nginx/access.log | \
  cut -d: -f1-3 | \
  uniq -c

# Most requested endpoints
awk '{print $7}' /var/log/nginx/access.log | \
  sort | uniq -c | sort -rn | head -20

# Response code distribution
awk '{print $9}' /var/log/nginx/access.log | \
  sort | uniq -c | sort -rn

# Slow requests (>1 second; assumes $request_time is logged as field 10)
awk '$10 > 1.0 {print $0}' /var/log/nginx/access.log

# Top user agents
awk -F'"' '{print $6}' /var/log/nginx/access.log | \
  sort | uniq -c | sort -rn | head -10
```
#### Performance Patterns

**Response Time Analysis**:
```bash
# Average response time
awk '{sum+=$10; count++} END {print "Average:", sum/count}' \
  /var/log/nginx/access.log

# Response time percentiles
awk '{print $10}' /var/log/nginx/access.log | \
  sort -n | \
  awk '{
    times[NR] = $1
  }
  END {
    print "P50:", times[int(NR*0.5)]
    print "P95:", times[int(NR*0.95)]
    print "P99:", times[int(NR*0.99)]
  }'

# Response time over time (hourly average; $4 is the timestamp, $10 the request time)
awk '{split($4, t, ":"); hour = t[1] ":" t[2]; sum[hour] += $10; count[hour]++}
  END {
    for (hour in sum) print hour, sum[hour]/count[hour]
  }' /var/log/nginx/access.log | sort
```
### 4. Timeline Analysis

Create timeline of events:

**Timeline Construction**:
```bash
# Merge multiple log sources by timestamp
# (-m assumes each file is already sorted and lines start with the timestamp)
sort -m -k1,2 \
  logs/application.log \
  logs/database.log \
  logs/nginx.log \
  > merged-timeline.log

# Extract timeline around specific event
event_time="2024-10-14 14:30:15"
grep "$event_time" logs/application.log -B 100 -A 100

# Timeline with multiple sources
for log in logs/*.log; do
  echo "=== $(basename "$log") ==="
  grep "$event_time" "$log" -B 10 -A 10
  echo ""
done
```

**Event Correlation**:
```python
# Correlate events across log sources
import re
from datetime import datetime, timedelta

def parse_log_line(line):
    # Extract timestamp and message
    match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', line)
    if match:
        timestamp = datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S')
        return timestamp, line
    return None, None

# Load events from multiple logs
events = []
for log_file in ['app.log', 'db.log', 'nginx.log']:
    with open(f'logs/{log_file}') as f:
        for line in f:
            timestamp, message = parse_log_line(line)
            if timestamp:
                events.append((timestamp, log_file, message))

# Sort by timestamp
events.sort(key=lambda x: x[0])

# Find events within time window
def find_related_events(target_time, window_seconds=10):
    window = timedelta(seconds=window_seconds)
    start_time = target_time - window
    end_time = target_time + window

    related = [
        event for event in events
        if start_time <= event[0] <= end_time
    ]

    return related

# Analyze error event
error_time = datetime(2024, 10, 14, 14, 30, 15)
related = find_related_events(error_time)

for timestamp, source, message in related:
    print(f"[{source}] {timestamp}: {message.strip()}")
```

### 5. Request Tracing

Trace individual requests across services:

**Request ID Tracing**:
```bash
# Extract request ID from error
error_line=$(grep "ERROR" logs/application.log | head -1)
request_id=$(echo "$error_line" | grep -oP 'request_id=\K[a-f0-9-]+')

echo "Tracing request: $request_id"

# Find all log entries for this request
grep "$request_id" logs/application.log

# Across multiple services
for log in logs/*.log; do
  echo "=== $(basename "$log") ==="
  grep "$request_id" "$log"
done

# With timestamps for timeline
grep "$request_id" logs/*.log | sort -k1,2
```

**Distributed Tracing Correlation**:
```bash
# Extract trace ID from logs
trace_id=$(grep "ERROR" logs/application.log | \
  head -1 | \
  grep -oP 'trace_id=\K[a-f0-9]+')

# Query distributed tracing system
# Jaeger
curl "http://jaeger:16686/api/traces/$trace_id"

# Zipkin
curl "http://zipkin:9411/api/v2/trace/$trace_id"
```
### 6. Anomaly Detection

Identify unusual patterns:

**Statistical Anomalies**:
```python
import statistics
from collections import defaultdict

# Analyze error rates per hour
hourly_errors = defaultdict(int)

with open('logs/application.log') as f:
    for line in f:
        if 'ERROR' in line:
            # Extract hour
            hour = line[:13]  # YYYY-MM-DD HH
            hourly_errors[hour] += 1

# Calculate statistics (needs at least two data points)
error_counts = list(hourly_errors.values())
mean = statistics.mean(error_counts)
stdev = statistics.stdev(error_counts)

# Find anomalies (>2 standard deviations)
print("Anomalous hours (>2 std dev from mean):")
for hour, count in sorted(hourly_errors.items()):
    z_score = (count - mean) / stdev
    if abs(z_score) > 2:
        print(f"{hour}: {count} errors (z-score: {z_score:.2f})")
```

**New Error Types**:
```bash
# Compare today's errors with baseline (yesterday's rotated log)
grep "ERROR" logs/application.log.1 | \
  awk -F'ERROR' '{print $2}' | \
  sort -u > baseline_errors.txt

grep "ERROR" logs/application.log | \
  awk -F'ERROR' '{print $2}' | \
  sort -u > current_errors.txt

# Find new error types (lines only in the current set)
comm -13 baseline_errors.txt current_errors.txt > new_errors.txt

echo "New error types detected:"
cat new_errors.txt
```

**Spike Detection**:
```python
# Detect sudden spikes in error rate
from collections import deque

def detect_spikes(values, window_size=10, threshold=3):
    """Detect values that are >threshold times the rolling average"""
    window = deque(maxlen=window_size)
    spikes = []

    for i, value in enumerate(values):
        if len(window) == window_size:
            avg = sum(window) / len(window)
            if value > avg * threshold:
                spikes.append((i, value, avg))

        window.append(value)

    return spikes

# Analyze minute-by-minute error counts
minute_errors = {}  # {minute: error_count}

with open('logs/application.log') as f:
    for line in f:
        if 'ERROR' in line:
            minute = line[:16]  # YYYY-MM-DD HH:MM
            minute_errors[minute] = minute_errors.get(minute, 0) + 1

# Detect spikes
minutes = sorted(minute_errors)
error_counts = [minute_errors[m] for m in minutes]
spikes = detect_spikes(error_counts, window_size=10, threshold=3)

print("Error spikes detected:")
for idx, value, avg in spikes:
    print(f"{minutes[idx]}: {value} errors (avg was {avg:.1f})")
```
### 7. Performance Analysis

Analyze performance from logs:

**Slow Query Analysis**:
```bash
# PostgreSQL slow query log (field positions depend on log_line_prefix)
grep "duration:" /var/log/postgresql/postgresql.log | \
  awk '{print $13, $0}' | \
  sort -rn | \
  head -20

# Extract queries slower than 1000 ms
awk '/duration:/ && $13 > 1000 {print $0}' \
  /var/log/postgresql/postgresql.log
```

**Endpoint Performance**:
```bash
# Average response time per endpoint ($request_time is in seconds)
awk '{endpoint[$7] += $10; count[$7]++}
  END {
    for (e in endpoint) {
      printf "%s: %.3fs\n", e, endpoint[e]/count[e]
    }
  }' /var/log/nginx/access.log | sort -t: -k2 -rn

# Slowest endpoints
awk '{print $10, $7}' /var/log/nginx/access.log | \
  sort -rn | \
  head -20
```
### 8. User Impact Analysis

Assess user-facing impact:

**Affected Users**:
```bash
# Extract unique users experiencing errors
grep "ERROR" logs/application.log | \
  grep -oP 'user_id=\K[a-zA-Z0-9]+' | \
  sort -u | \
  wc -l

# Error count by user
grep "ERROR" logs/application.log | \
  grep -oP 'user_id=\K[a-zA-Z0-9]+' | \
  sort | uniq -c | sort -rn | head -20

# Per-user error rate (errors / total lines mentioning the user)
awk 'match($0, /user_id=[a-zA-Z0-9]+/) {
  id = substr($0, RSTART + 8, RLENGTH - 8)
  total[id]++
  if (/ERROR/) errors[id]++
}
END {
  for (user in total) printf "%s %.1f%%\n", user, errors[user]/total[user]*100
}' logs/application.log | sort -k2 -rn
```

**Failed Requests**:
```bash
# 5xx errors
grep " 5[0-9][0-9] " /var/log/nginx/access.log

# Failed endpoints
awk '$9 >= 500 {print $7}' /var/log/nginx/access.log | \
  sort | uniq -c | sort -rn

# Failed request details
awk '$9 >= 500 {print $4, $7, $9, $10}' \
  /var/log/nginx/access.log
```
### 9. Resource Usage from Logs

Extract resource usage patterns:

**Memory Usage**:
```bash
# Extract memory logs
grep -i "memory\|heap\|oom" logs/application.log

# Parse memory usage
grep "heap_used" logs/application.log | \
  awk '{print $1, $2, $NF}' | \
  sed 's/MB$//'
```

**Connection Pool**:
```bash
# Database connection logs
grep "connection" logs/application.log | \
  grep -oP 'pool_size=\K\d+|active=\K\d+|idle=\K\d+'

# Connection exhaustion
grep "connection.*timeout\|pool.*exhausted" logs/application.log -A 5
```
### 10. Security Analysis

Look for security-related issues:

**Authentication Failures**:
```bash
# Failed login attempts
grep -i "authentication.*failed\|login.*failed" logs/application.log

# By IP address
grep "authentication.*failed" logs/application.log | \
  grep -oP 'ip=\K[\d.]+' | \
  sort | uniq -c | sort -rn

# Brute force detection (sort before uniq -c so repeats are counted)
grep "authentication.*failed" logs/application.log | \
  grep -oP 'ip=\K[\d.]+' | \
  sort | uniq -c | \
  awk '$1 > 10 {print $2, $1 " attempts"}'
```

**Suspicious Patterns**:
```bash
# SQL injection attempts
grep -iE "union.*select|drop.*table|; --" /var/log/nginx/access.log

# Path traversal attempts
grep -E "\.\./|\.\.%2F" /var/log/nginx/access.log

# XSS attempts
grep -iE "<script|javascript:|onerror=" /var/log/nginx/access.log

# Command injection attempts
grep -E ";\s*(cat|ls|wget|curl)" /var/log/nginx/access.log
```
## Output Format

```markdown
# Log Analysis Report: [Issue/Time Period]

## Summary
[High-level summary of findings]

## Analysis Period
- **Start**: [start timestamp]
- **End**: [end timestamp]
- **Duration**: [duration]
- **Log Sources**: [list of logs analyzed]
- **Total Lines**: [number of log lines]

## Key Findings

### Error Analysis
- **Total Errors**: [count]
- **Error Rate**: [percentage]%
- **Error Types**: [number of unique error types]
- **Most Common Error**: [error type] ([count] occurrences)

### Top Errors

1. **[Error Type 1]** - [count] occurrences
   \`\`\`
   [sample log line]
   \`\`\`
   - First seen: [timestamp]
   - Last seen: [timestamp]
   - Peak: [timestamp with highest frequency]

2. **[Error Type 2]** - [count] occurrences
   \`\`\`
   [sample log line]
   \`\`\`
   - [similar details]

### Patterns Detected

#### Pattern 1: [Pattern Name]
- **Description**: [what the pattern is]
- **Frequency**: [how often it occurs]
- **Impact**: [user/system impact]
- **Example**:
  \`\`\`
  [log excerpt showing pattern]
  \`\`\`

#### Pattern 2: [Pattern Name]
[similar structure]

## Timeline Analysis

### Critical Events Timeline

\`\`\`
14:25:30 [APP] Normal operation, avg response time 50ms
14:28:45 [APP] Response time increasing to 150ms
14:29:10 [DB] Connection pool usage at 90%
14:29:30 [APP] First timeout errors appear
14:29:45 [DB] Connection pool exhausted
14:30:00 [APP] Error rate spikes to 25%
14:30:15 [APP] Circuit breaker opens
14:30:30 [OPS] Auto-scaling triggers
14:32:00 [APP] New instances online
14:33:00 [APP] Error rate decreases to 5%
14:35:00 [APP] Full recovery, normal operation
\`\`\`

### Event Correlation

**Root Event**: Database connection pool exhaustion at 14:29:45

**Contributing Factors**:
- High traffic spike (+300% at 14:28:00)
- Long-running queries (>5s queries detected)
- Insufficient connection pool size (max: 20)

**Cascading Effects**:
- API timeouts (starting 14:29:30)
- Cache misses due to timeouts
- Increased load from retries
- Circuit breaker activation

## Request Tracing

### Example Failed Request

**Request ID**: req_abc123def456

**Timeline**:
\`\`\`
14:30:15.123 [NGINX] Request received: POST /api/orders
14:30:15.125 [APP] Request processing started
14:30:15.130 [APP] Database query started: SELECT orders...
14:30:20.131 [DB] Query timeout after 5s
14:30:20.135 [APP] Error: Database timeout
14:30:20.137 [APP] Response: 500 Internal Server Error
14:30:20.140 [NGINX] Response sent (5017ms)
\`\`\`

**User Impact**: Order creation failed for user_123

## Anomalies Detected

### Anomaly 1: Error Rate Spike
- **Time**: 14:30:00 - 14:35:00
- **Severity**: High
- **Details**: Error rate jumped from 0.1% to 25%
- **Affected Users**: ~500 users
- **Root Cause**: Database connection pool exhaustion

### Anomaly 2: New Error Type
- **Error**: "ConnectionPoolExhausted"
- **First Seen**: 14:29:45
- **Frequency**: 1,234 occurrences in 5 minutes
- **Status**: Previously unseen in baseline

## Performance Analysis

### Response Time Statistics
- **Average**: 150ms (baseline: 50ms)
- **P50**: 80ms
- **P95**: 500ms
- **P99**: 2000ms
- **Max**: 5000ms

### Slowest Endpoints
1. `/api/orders` - avg 450ms (1,200 requests)
2. `/api/users/profile` - avg 380ms (800 requests)
3. `/api/reports` - avg 320ms (200 requests)

### Database Performance
- **Slow Queries**: 45 queries >1s
- **Slowest Query**: 5.2s (SELECT with missing index)
- **Average Query Time**: 85ms (baseline: 25ms)

## User Impact

### Affected Users
- **Total Affected**: ~500 users
- **Error Rate by User Type**:
  - Premium users: 5% error rate
  - Free users: 30% error rate
- **Most Affected User**: user_789 (25 errors)

### Failed Operations
- **Order Creation**: 234 failures
- **Payment Processing**: 89 failures
- **Profile Updates**: 45 failures

## Resource Analysis

### Connection Pool
- **Max Size**: 20 connections
- **Peak Usage**: 20/20 (100%)
- **Average Wait Time**: 2.5s
- **Recommendation**: Increase to 50 connections

### Memory Usage
- **Average**: 450MB
- **Peak**: 890MB
- **Trend**: Stable (no leak detected)

## Security Findings

### Authentication
- **Failed Logins**: 12
- **Suspicious IPs**: 2 IPs with >5 failed attempts
- **Brute Force Attempts**: None detected

### Attack Patterns
- **SQL Injection Attempts**: 0
- **XSS Attempts**: 0
- **Path Traversal**: 0

## Root Cause Analysis

Based on log analysis:

**Primary Cause**: Database connection pool too small for traffic volume

**Contributing Factors**:
1. Traffic spike (+300%)
2. Slow queries consuming connections
3. No connection timeout configured

**Evidence**:
- Connection pool exhausted at 14:29:45
- Immediate correlation with error spike
- Recovery after auto-scaling added capacity

## Recommendations

### Immediate Actions
1. Increase database connection pool to 50
2. Add connection timeout (30s)
3. Optimize slow queries identified

### Monitoring Improvements
1. Alert on connection pool usage >80%
2. Track query duration P95
3. Monitor error rate per endpoint

### Code Changes
1. Add query timeouts to all database calls
2. Implement connection retry logic
3. Add circuit breaker for database calls

## Next Steps

1. **Fix**: Use `/debug fix` to implement connection pool increase
2. **Performance**: Use `/debug performance` to optimize slow queries
3. **Monitoring**: Add alerts for connection pool usage

## Appendices

### A. Full Error Log Excerpt
\`\`\`
[Relevant log excerpts]
\`\`\`

### B. Query Performance Data
\`\`\`sql
[Slow query details]
\`\`\`

### C. Traffic Pattern Graph
\`\`\`
[ASCII graph or description of traffic pattern]
\`\`\`
```
## Error Handling

**Logs Not Found**:
If specified log files don't exist:
1. List available log files
2. Suggest alternative log locations
3. Provide commands to locate logs (see the sketch below)
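
One practical way to locate logs is to ask the running process itself which files it has open. A minimal sketch, assuming the process can be found by name:

```bash
# Find log files held open by the running process (process name is an assumption)
pid=$(pgrep -f application | head -1)
lsof -p "$pid" 2>/dev/null | grep -i '\.log'

# Or search for log files modified in the last hour, system-wide
sudo find / -name "*.log" -mmin -60 2>/dev/null | head -20
```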

**Logs Too Large**:
If logs are too large to analyze:
1. Focus on most recent data
2. Use sampling techniques (see the sketch below)
3. Analyze specific time windows
4. Suggest log aggregation tools
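
A minimal sampling sketch; the thresholds and window are illustrative:

```bash
# Bound the analysis to the most recent million lines
tail -n 1000000 huge.log > recent.log

# Random ~1% sample of the full file
awk 'rand() < 0.01' huge.log > sample.log

# Or restrict to a specific one-hour window
awk '/^2024-10-14 14:/' huge.log > window.log
```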

**Insufficient Context**:
If logs lack necessary information:
1. Document what information is missing
2. Suggest additional logging
3. Recommend structured logging format (see the sketch below)
4. Propose log enrichment strategies
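
To illustrate what structured logging buys you: a JSON log line can be filtered and reshaped directly with `jq`, with no positional field arithmetic. The field names here are assumptions:

```bash
# A structured line is queryable by name rather than by column position
echo '{"ts":"2024-10-14T14:30:15Z","level":"error","request_id":"abc123","msg":"db timeout"}' |
  jq -r 'select(.level == "error") | [.ts, .request_id, .msg] | @tsv'
```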

## Integration with Other Operations

- **Before**: Use `/debug diagnose` to identify time period to analyze
- **After**: Use `/debug fix` to address issues found in logs
- **Related**: Use `/debug performance` for performance issues
- **Related**: Use `/debug reproduce` to recreate issues found in logs

## Agent Utilization

This operation leverages the **10x-fullstack-engineer** agent for:
- Pattern recognition across large log volumes
- Correlating events across multiple log sources
- Statistical analysis and anomaly detection
- Root cause inference from log patterns
- Actionable recommendations based on findings
759
commands/debug/diagnose.md
Normal file
@@ -0,0 +1,759 @@
# Diagnose Operation - Comprehensive Diagnosis and Root Cause Analysis

You are executing the **diagnose** operation to perform comprehensive diagnosis and root cause analysis for complex issues spanning multiple layers of the application stack.

## Parameters

**Received**: `$ARGUMENTS` (after removing 'diagnose' operation name)

Expected format: `issue:"problem description" [environment:"prod|staging|dev"] [logs:"log-location"] [reproduction:"steps"] [impact:"severity"]`

## Workflow

### 1. Issue Understanding

Gather and analyze comprehensive information about the issue:

**Information to Collect**:
- **Symptom**: What is the observable problem? What exactly is failing?
- **Impact**: Who is affected? How many users? Business impact?
- **Frequency**: Consistent, intermittent, or rare? Percentage of occurrences?
- **Environment**: Production, staging, or development? Specific regions/zones?
- **Timeline**: When did it start? Any correlation with deployments?
- **Recent Changes**: Deployments, config changes, infrastructure changes?
- **Error Messages**: Complete error messages, stack traces, error codes

**Questions to Answer**:
```markdown
- What is the user experiencing?
- What should be happening instead?
- How widespread is the issue?
- Is it getting worse over time?
- Are there any patterns (time of day, user types, specific actions)?
```

### 2. Data Collection Across All Layers

Systematically collect diagnostic data from each layer of the stack:

#### Frontend Diagnostics

**Browser Console Analysis**:
```javascript
// Review console.error and console.warn output in the browser console

// Inspect unhandled promise rejections
window.addEventListener('unhandledrejection', event => {
  console.error('Unhandled promise rejection:', event.reason);
});

// Check for resource loading failures
performance.getEntriesByType('resource').filter(r => r.transferSize === 0)
```

**Network Request Analysis**:
```javascript
// Analyze failed requests
// Open DevTools > Network tab
// Filter: Status code 4xx, 5xx
// Check: Request headers, payload, response body, timing

// Performance timing
const perfEntries = performance.getEntriesByType('navigation')[0];
console.log('DNS lookup:', perfEntries.domainLookupEnd - perfEntries.domainLookupStart);
console.log('TCP connection:', perfEntries.connectEnd - perfEntries.connectStart);
console.log('Request time:', perfEntries.responseStart - perfEntries.requestStart);
console.log('Response time:', perfEntries.responseEnd - perfEntries.responseStart);
```

**State Inspection**:
```javascript
// React DevTools: Component state at error time
// Redux DevTools: Action history, state snapshots
// Vue DevTools: Vuex state, component hierarchy

// Add error boundary to capture React errors
class ErrorBoundary extends React.Component {
  componentDidCatch(error, errorInfo) {
    console.error('Component error:', {
      error: error.toString(),
      componentStack: errorInfo.componentStack,
      currentState: this.props.reduxState
    });
  }
}
```

#### Backend Diagnostics

**Application Logs**:
```bash
# Real-time application logs
tail -f logs/application.log

# Error logs with context
grep -i "error\|exception\|fatal" logs/*.log -A 10 -B 5

# Filter by request ID to trace single request
grep "request-id-12345" logs/*.log

# Find patterns in errors
awk '/ERROR/ {print $0}' logs/application.log | sort | uniq -c | sort -rn

# Time-based analysis
grep "2024-10-14 14:" logs/application.log | grep ERROR
```

**System Logs**:
```bash
# Service logs (systemd)
journalctl -u application-service.service -f
journalctl -u application-service.service --since "1 hour ago"

# Syslog
tail -f /var/log/syslog | grep application

# Kernel logs (for system-level issues)
dmesg -T | tail -50
```

**Application Metrics**:
```bash
# Request rate and response times
# Check APM tools: New Relic, Datadog, Elastic APM

# HTTP response codes over time
awk '{print $9}' /var/log/nginx/access.log | sort | uniq -c

# Slow requests (>1 second; nginx logs $request_time in seconds)
awk '$10 > 1.0 {print $0}' /var/log/nginx/access.log

# Error rate calculation
errors=$(grep -c "ERROR" logs/application.log)
total=$(wc -l < logs/application.log)
echo "Error rate: $(echo "scale=4; $errors / $total * 100" | bc)%"
```

#### Database Diagnostics

**Active Queries and Locks**:
```sql
-- PostgreSQL: Active queries
SELECT
  pid,
  now() - query_start AS duration,
  state,
  query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;

-- Long-running queries
SELECT
  pid,
  now() - query_start AS duration,
  query
FROM pg_stat_activity
WHERE state = 'active'
  AND now() - query_start > interval '1 minute';

-- Blocking queries
SELECT
  blocked_locks.pid AS blocked_pid,
  blocked_activity.usename AS blocked_user,
  blocking_locks.pid AS blocking_pid,
  blocking_activity.usename AS blocking_user,
  blocked_activity.query AS blocked_statement,
  blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks
  ON blocking_locks.locktype = blocked_locks.locktype
  AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
  AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
  AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
  AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
  AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
  AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
  AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
  AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
  AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
  AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;

-- Deadlock information (from logs)
-- Look for "deadlock detected" in PostgreSQL logs
```

**Database Performance**:
```sql
-- Table statistics
SELECT
  schemaname,
  relname AS tablename,
  n_live_tup AS live_rows,
  n_dead_tup AS dead_rows,
  last_vacuum,
  last_autovacuum
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC;

-- Index usage
SELECT
  schemaname,
  relname AS tablename,
  indexrelname AS indexname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;

-- Connection count
SELECT
  count(*) AS connections,
  state,
  usename
FROM pg_stat_activity
GROUP BY state, usename;

-- Cache hit ratio
SELECT
  sum(heap_blks_read) AS heap_read,
  sum(heap_blks_hit) AS heap_hit,
  sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) AS cache_hit_ratio
FROM pg_statio_user_tables;
```

**Slow Query Log Analysis**:
```bash
# PostgreSQL: Enable log_min_duration_statement
# Check postgresql.conf: log_min_duration_statement = 1000 (1 second)

# Analyze slow queries (field positions depend on log_line_prefix)
grep "duration:" /var/log/postgresql/postgresql.log | awk '{print $3, $6}' | sort -rn | head -20
```
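
If the setting is not yet enabled, it can be turned on at runtime from a superuser session rather than by editing postgresql.conf. A sketch; `ALTER SYSTEM` requires PostgreSQL 9.4+:

```bash
# Log every statement slower than 1 second, then reload the configuration
psql -U postgres -c "ALTER SYSTEM SET log_min_duration_statement = 1000;"
psql -U postgres -c "SELECT pg_reload_conf();"
```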

#### Infrastructure Diagnostics

**Resource Usage**:
```bash
# CPU usage
top -bn1 | head -20
mpstat 1 5  # CPU stats every 1 second, 5 times

# Memory usage
free -h
vmstat 1 5

# Disk I/O
iostat -x 1 5
iotop -o  # Only show processes doing I/O

# Disk space
df -h
du -sh /* | sort -rh | head -10

# Network connections
netstat -an | grep ESTABLISHED | wc -l
ss -s  # Socket statistics

# Open files
lsof | wc -l
lsof -u application-user | wc -l
```

**Container Diagnostics (Docker/Kubernetes)**:
```bash
# Docker container logs
docker logs container-name --tail 100 -f
docker stats container-name

# Docker container inspection
docker inspect container-name
docker exec container-name ps aux
docker exec container-name df -h

# Kubernetes pod logs
kubectl logs pod-name -f
kubectl logs pod-name --previous  # Previous container logs

# Kubernetes pod resource usage
kubectl top pods
kubectl describe pod pod-name

# Kubernetes events
kubectl get events --sort-by='.lastTimestamp'
```

**Cloud Provider Metrics**:
```bash
# AWS CloudWatch
aws cloudwatch get-metric-statistics \
  --namespace AWS/EC2 \
  --metric-name CPUUtilization \
  --dimensions Name=InstanceId,Value=i-1234567890abcdef0 \
  --start-time 2024-10-14T00:00:00Z \
  --end-time 2024-10-14T23:59:59Z \
  --period 3600 \
  --statistics Average

# Check application logs
aws logs tail /aws/application/logs --follow

# GCP Stackdriver
gcloud logging read "resource.type=gce_instance AND severity>=ERROR" --limit 50

# Azure Monitor
az monitor metrics list --resource <resource-id> --metric "Percentage CPU"
```

### 3. Hypothesis Formation

Based on collected data, form testable hypotheses about the root cause:

**Common Issue Patterns to Consider**:

#### Race Conditions
**Symptoms**:
- Intermittent failures
- Works sometimes, fails other times
- Timing-dependent behavior
- "Cannot read property of undefined" on objects that should exist

**What to Check**:
```javascript
// Look for async operations without proper waiting
async function problematic() {
  let data;
  fetchData().then(result => data = result);  // ❌ Race condition
  return processData(data);  // May execute before data is set
}

// Proper async/await
async function correct() {
  const data = await fetchData();  // ✅ Wait for data
  return processData(data);
}

// Multiple parallel operations
Promise.all([op1(), op2(), op3()])  // Check for interdependencies
```

#### Memory Leaks
**Symptoms**:
- Degrading performance over time
- Increasing memory usage
- Eventually crashes with OOM errors
- Slow garbage collection

**What to Check**:
```javascript
// Event listeners not removed
componentDidMount() {
  window.addEventListener('resize', this.handleResize);
  // ❌ Missing removeEventListener in componentWillUnmount
}

// Closures holding references
function createLeak() {
  const largeData = new Array(1000000);
  return () => console.log(largeData[0]);  // Holds entire array
}

// Timers not cleared
setInterval(() => fetchData(), 1000);  // ❌ Never cleared

// Cache without eviction
const cache = {};
cache[key] = value;  // ❌ Grows indefinitely
```

#### Database Issues
**Symptoms**:
- Slow queries
- Timeouts
- Deadlocks
- Connection pool exhausted

**What to Check**:
```sql
-- Missing indexes
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Look for "Seq Scan" on large tables

-- N+1 queries
-- Check if ORM is making one query per item in a loop

-- Long transactions
-- Find transactions open for extended periods

-- Lock contention
-- Check for blocking queries and deadlocks
```

#### Network Issues
**Symptoms**:
- Timeouts
- Intermittent connectivity
- DNS resolution failures
- SSL/TLS handshake errors

**What to Check**:
```bash
# DNS resolution
dig api.example.com
nslookup api.example.com

# Network latency
ping api.example.com
traceroute api.example.com

# TCP connection
telnet api.example.com 443
nc -zv api.example.com 443

# SSL/TLS verification
openssl s_client -connect api.example.com:443 -servername api.example.com
```

#### Authentication/Authorization
**Symptoms**:
- 401 Unauthorized errors
- 403 Forbidden errors
- Intermittent authentication failures
- Session expired errors

**What to Check**:
```javascript
// Token expiration
const token = jwt.decode(authToken);
console.log('Token expires:', new Date(token.exp * 1000));

// Session state
console.log('Session:', sessionStorage, localStorage);

// Cookie issues
console.log('Cookies:', document.cookie);

// CORS issues (browser console)
// Look for: "CORS policy: No 'Access-Control-Allow-Origin' header"
```

#### Configuration Issues
**Symptoms**:
- Works locally, fails in environment
- "Environment variable not set" errors
- Connection refused errors
- Permission denied errors

**What to Check**:
```bash
# Environment variables
printenv | grep APPLICATION
env | sort

# Configuration files
cat config/production.json
diff config/development.json config/production.json

# File permissions
ls -la config/
ls -la /var/application/

# Network configuration
cat /etc/hosts
cat /etc/resolv.conf
```

### 4. Hypothesis Testing

Systematically test each hypothesis:

**Testing Strategy**:

1. **Isolation**: Test each component in isolation
2. **Instrumentation**: Add detailed logging around suspected areas
3. **Reproduction**: Create minimal reproduction case
4. **Elimination**: Rule out hypotheses systematically

**Add Diagnostic Instrumentation**:
```javascript
// Detailed logging with context
const startTime = Date.now();

console.log('[DIAG] Before operation:', {
  timestamp: new Date().toISOString(),
  user: currentUser,
  state: JSON.stringify(currentState),
  params: params
});

try {
  const result = await operation(params);
  console.log('[DIAG] Operation success:', {
    timestamp: new Date().toISOString(),
    result: result,
    duration: Date.now() - startTime
  });
} catch (error) {
  console.error('[DIAG] Operation failed:', {
    timestamp: new Date().toISOString(),
    error: error.message,
    stack: error.stack,
    context: { user, state, params }
  });
  throw error;
}

// Performance timing
console.time('operation');
await operation();
console.timeEnd('operation');

// Memory usage tracking (run node with --expose-gc to enable global.gc)
if (global.gc) {
  global.gc();
  const usage = process.memoryUsage();
  console.log('[MEMORY]', {
    heapUsed: Math.round(usage.heapUsed / 1024 / 1024) + 'MB',
    heapTotal: Math.round(usage.heapTotal / 1024 / 1024) + 'MB',
    external: Math.round(usage.external / 1024 / 1024) + 'MB'
  });
}
```

**Binary Search Debugging**:
```javascript
// Comment out half the code
// Determine which half has the bug
// Repeat until isolated

// Example: Large function with error
function complexOperation() {
  // Part 1: Data fetching
  const data = fetchData();

  // Part 2: Data processing
  const processed = processData(data);

  // Part 3: Data validation
  const validated = validateData(processed);

  // Part 4: Data saving
  return saveData(validated);
}

// Test each part independently
const data = fetchData();
console.log('[TEST] Data fetched:', data);  // ✅ Works

const processed = processData(testData);
console.log('[TEST] Data processed:', processed);  // ❌ Fails here
// Now investigate processData() specifically
```
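
The same halving idea applies across commit history when the failure correlates with a change rather than a code path. A sketch using `git bisect`, where `./run-test.sh` stands in for any command that exits non-zero on failure:

```bash
git bisect start
git bisect bad                 # current commit exhibits the bug
git bisect good v1.4.0         # last known-good ref
git bisect run ./run-test.sh   # git narrows the range automatically
git bisect reset               # return to the original HEAD when done
```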

### 5. Root Cause Identification

Once hypotheses are tested and narrowed down:

**Confirm Root Cause**:
1. Can you consistently reproduce the issue?
2. Does fixing this cause resolve the symptom?
3. Are there other instances of the same issue?
4. Does the fix have any side effects?

**Document Evidence**:
- Specific code/config that causes the issue
- Exact conditions required for the issue to manifest
- Why this causes the observed symptom
- Related code that might have the same issue

### 6. Impact Assessment

Evaluate the full impact:

**User Impact**:
- Number of users affected
- Severity of impact (blocking, degraded, minor)
- User actions affected
- Business metrics impacted

**System Impact**:
- Performance degradation
- Resource consumption
- Downstream service effects
- Data integrity concerns

**Risk Assessment**:
- Can it cause data loss?
- Can it cause security issues?
- Can it cause cascading failures?
- Is it getting worse over time?

## Output Format

```markdown
# Diagnosis Report: [Issue Summary]

## Executive Summary
[One-paragraph summary of issue, root cause, and recommended action]

## Issue Description

### Symptoms
- [Observable symptom 1]
- [Observable symptom 2]
- [Observable symptom 3]

### Impact
- **Affected Users**: [number/percentage of users]
- **Severity**: [critical|high|medium|low]
- **Frequency**: [always|often|sometimes|rarely - with percentage]
- **Business Impact**: [revenue loss, user experience, etc.]

### Environment
- **Environment**: [production|staging|development]
- **Version**: [application version]
- **Infrastructure**: [relevant infrastructure details]
- **Region**: [if applicable]

### Timeline
- **First Observed**: [date/time]
- **Recent Changes**: [deployments, config changes]
- **Pattern**: [time-based, load-based, user-based]

## Diagnostic Data Collected

### Frontend Analysis
[Console errors, network requests, performance data, state inspection results]

### Backend Analysis
[Application logs, error traces, system metrics, request patterns]

### Database Analysis
[Query logs, lock information, performance metrics, connection pool status]

### Infrastructure Analysis
[Resource usage, container logs, cloud metrics, network diagnostics]

## Hypothesis Analysis

### Hypotheses Considered
1. **[Hypothesis 1]**: [Description]
   - **Evidence For**: [supporting evidence]
   - **Evidence Against**: [contradicting evidence]
   - **Conclusion**: [Ruled out|Confirmed|Needs more investigation]

2. **[Hypothesis 2]**: [Description]
   - **Evidence For**: [supporting evidence]
   - **Evidence Against**: [contradicting evidence]
   - **Conclusion**: [Ruled out|Confirmed|Needs more investigation]

3. **[Hypothesis 3]**: [Description]
   - **Evidence For**: [supporting evidence]
   - **Evidence Against**: [contradicting evidence]
   - **Conclusion**: [Ruled out|Confirmed|Needs more investigation]

## Root Cause

### Root Cause Identified
[Detailed explanation of the root cause with specific code/config references]

### Why It Causes the Symptom
[Technical explanation of how the root cause leads to the observed behavior]

### Why It Wasn't Caught Earlier
[Explanation of why tests/monitoring didn't catch this]

### Related Issues
[Any similar issues that might exist or could be fixed with a similar approach]

## Evidence

### Code/Configuration
\`\`\`[language]
[Specific code or configuration causing the issue]
\`\`\`

### Reproduction
[Exact steps to reproduce the issue consistently]

### Verification
[Steps taken to confirm this is the root cause]

## Recommended Actions

### Immediate Actions
1. [Immediate action 1 - e.g., rollback, circuit breaker]
2. [Immediate action 2]

### Permanent Fix
[Description of the permanent fix needed]

### Prevention
- **Monitoring**: [What monitoring to add]
- **Testing**: [What tests to add]
- **Code Review**: [What to look for in code reviews]
- **Documentation**: [What to document]

## Next Steps

1. **Fix Implementation**: [Use /debug fix operation]
2. **Verification**: [Testing strategy]
3. **Deployment**: [Rollout plan]
4. **Monitoring**: [What to watch]

## Appendices

### A. Detailed Logs
[Relevant log excerpts with context]

### B. Metrics and Graphs
[Performance metrics, error rates, resource usage]

### C. Related Tickets
[Links to related issues or tickets]
```

## Error Handling

**Insufficient Information**:
If diagnosis cannot be completed due to missing information:
1. List specific information needed
2. Explain why each piece is important
3. Provide instructions for gathering data
4. Suggest interim monitoring

**Cannot Reproduce**:
If the issue cannot be reproduced:
1. Document reproduction attempts
2. Request more detailed reproduction steps
3. Suggest environment comparison (see the sketch below)
4. Propose production debugging approach
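
A minimal environment-comparison sketch; the host aliases and paths are assumptions:

```bash
# Compare environment variables between environments
diff <(ssh app@prod 'env | sort') <(ssh app@staging 'env | sort')

# Compare installed dependency versions
diff <(ssh app@prod 'cd /srv/app && npm ls --depth=0') \
     <(ssh app@staging 'cd /srv/app && npm ls --depth=0')

# Compare configuration checked into the repo
diff config/staging.json config/production.json
```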

**Multiple Root Causes**:
If multiple root causes are identified:
1. Prioritize by impact
2. Explain interdependencies
3. Provide fix sequence
4. Suggest monitoring between fixes

## Integration with Other Operations

After diagnosis is complete:
- **For fixes**: Use `/debug fix` with identified root cause
- **For reproduction**: Use `/debug reproduce` to create reliable test case
- **For log analysis**: Use `/debug analyze-logs` for deeper log investigation
- **For performance**: Use `/debug performance` if performance-related
- **For memory**: Use `/debug memory` if memory-related

## Agent Utilization

This operation leverages the **10x-fullstack-engineer** agent for:
- Systematic cross-layer analysis
- Pattern recognition across stack
- Hypothesis formation and testing
- Production debugging expertise
- Prevention-focused thinking
967
commands/debug/fix.md
Normal file
@@ -0,0 +1,967 @@
# Fix Operation - Targeted Fix Implementation

You are executing the **fix** operation to implement targeted fixes with comprehensive verification and prevention measures.

## Parameters

**Received**: `$ARGUMENTS` (after removing 'fix' operation name)

Expected format: `issue:"problem description" root_cause:"identified-cause" [verification:"test-strategy"] [scope:"affected-areas"] [rollback:"rollback-plan"]`

## Workflow

### 1. Understand the Fix Requirements

Clarify what needs to be fixed and the constraints:

**Key Information**:
- **Root Cause**: Exact cause to address (from diagnosis)
- **Scope**: What code/config/infrastructure needs changing
- **Constraints**: Performance, backwards compatibility, security
- **Verification**: How to verify the fix works
- **Rollback**: Plan if the fix causes problems

**Fix Strategy Questions**:
```markdown
- Is this a code fix, configuration fix, or infrastructure fix?
- Are there multiple ways to fix this? Which is best?
- What are the side effects of the fix?
- Can we fix just the symptom or must we fix the root cause?
- Is there existing code doing this correctly we can learn from?
- What is the blast radius if the fix goes wrong?
```

### 2. Design the Fix

Plan the implementation approach:

#### Fix Pattern Selection

**Code Fix Patterns**:

**1. Add Missing Error Handling**
```javascript
// Before (causes crashes)
async function processPayment(orderId) {
  const order = await db.orders.findById(orderId);
  return await paymentGateway.charge(order.amount);
}

// After (handles errors properly)
async function processPayment(orderId) {
  try {
    const order = await db.orders.findById(orderId);

    if (!order) {
      throw new Error(`Order ${orderId} not found`);
    }

    if (order.status !== 'pending') {
      throw new Error(`Order ${orderId} is not in pending status`);
    }

    const result = await paymentGateway.charge(order.amount);

    if (!result.success) {
      throw new Error(`Payment failed: ${result.error}`);
    }

    return result;
  } catch (error) {
    logger.error('Payment processing failed', {
      orderId,
      error: error.message,
      stack: error.stack
    });
    throw new PaymentError(`Failed to process payment for order ${orderId}`, error);
  }
}
```

**2. Fix Race Condition**
```javascript
// Before (race condition)
let cache = null;

async function getData() {
  if (!cache) {
    cache = await fetchFromDatabase();  // Multiple concurrent calls
  }
  return cache;
}

// After (properly synchronized: concurrent callers share one in-flight promise)
let cache = null;
let cachePromise = null;

async function getData() {
  if (!cache) {
    if (!cachePromise) {
      cachePromise = fetchFromDatabase();
    }
    cache = await cachePromise;
    cachePromise = null;
  }
  return cache;
}

// Or use a proper caching library
const promiseMemoize = require('promise-memoize');
const getData = promiseMemoize(async () => {
  return await fetchFromDatabase();
}, { maxAge: 60000 });
```

**3. Fix Memory Leak**
```javascript
// Before (memory leak)
class Component extends React.Component {
  componentDidMount() {
    window.addEventListener('resize', this.handleResize);
    this.interval = setInterval(this.fetchData, 5000);
  }

  // componentWillUnmount missing - listeners never removed
}

// After (properly cleaned up)
class Component extends React.Component {
  componentDidMount() {
    window.addEventListener('resize', this.handleResize);
    this.interval = setInterval(this.fetchData, 5000);
  }

  componentWillUnmount() {
    window.removeEventListener('resize', this.handleResize);
    clearInterval(this.interval);
  }
}
```

**4. Add Missing Validation**
```javascript
// Before (no validation)
app.post('/api/users', async (req, res) => {
  const user = await db.users.create(req.body);
  res.json(user);
});

// After (proper validation)
const { body, validationResult } = require('express-validator');

app.post('/api/users',
  // Validation middleware
  body('email').isEmail().normalizeEmail(),
  body('password').isLength({ min: 8 }).matches(/[A-Z]/).matches(/[0-9]/),
  body('age').optional().isInt({ min: 0, max: 150 }),

  async (req, res) => {
    // Check validation results
    const errors = validationResult(req);
    if (!errors.isEmpty()) {
      return res.status(400).json({ errors: errors.array() });
    }

    try {
      const user = await db.users.create({
        email: req.body.email,
        password: await hashPassword(req.body.password),
        age: req.body.age
      });

      res.json(user);
    } catch (error) {
      logger.error('User creation failed', error);
      res.status(500).json({ error: 'Failed to create user' });
    }
  }
);
```

**5. Fix N+1 Query Problem**
```javascript
// Before (N+1 queries)
async function getUsersWithOrders() {
  const users = await db.users.findAll();

  for (const user of users) {
    user.orders = await db.orders.findByUserId(user.id);  // N queries
  }

  return users;
}

// After (single query with join)
async function getUsersWithOrders() {
  const users = await db.users.findAll({
    include: [
      { model: db.orders, as: 'orders' }
    ]
  });

  return users;
}

// Or with manual batching (two queries total)
async function getUsersWithOrders() {
  const users = await db.users.findAll();
  const userIds = users.map(u => u.id);
  const orders = await db.orders.findAll({
    where: { userId: userIds }
  });

  // Group orders by userId
  const ordersByUser = orders.reduce((acc, order) => {
    if (!acc[order.userId]) acc[order.userId] = [];
    acc[order.userId].push(order);
    return acc;
  }, {});

  // Attach to users
  users.forEach(user => {
    user.orders = ordersByUser[user.id] || [];
  });

  return users;
}
```

**Configuration Fix Patterns**:

**1. Fix Missing Environment Variable**
```bash
# Before (hardcoded)
DATABASE_URL=postgresql://localhost/myapp

# After (environment-specific)
# .env.production
DATABASE_URL=postgresql://prod-db.example.com:5432/myapp_prod?sslmode=require
```

```javascript
// Application code should validate required vars at startup
const requiredEnvVars = ['DATABASE_URL', 'API_KEY', 'SECRET_KEY'];
for (const envVar of requiredEnvVars) {
  if (!process.env[envVar]) {
    throw new Error(`Required environment variable ${envVar} is not set`);
  }
}
```

**2. Fix Resource Limits**
```yaml
# Before (no limits - causes OOM)
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: app
          image: myapp:latest

# After (proper resource limits)
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: app
          image: myapp:latest
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
```
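
After applying the change, it is worth confirming the limits actually took effect and watching real usage against them. A sketch, assuming the deployment is labeled `app=myapp` and metrics-server is installed:

```bash
kubectl apply -f deployment.yaml
kubectl describe pod -l app=myapp | grep -A 4 'Limits:'
kubectl top pods -l app=myapp   # live usage vs. the configured limits
```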

**Infrastructure Fix Patterns**:

**1. Fix Nginx Upload Size Limit**
```nginx
# Before (default 1MB limit)
server {
    listen 80;
    server_name example.com;

    location / {
        proxy_pass http://localhost:3000;
    }
}

# After (increased limit)
server {
    listen 80;
    server_name example.com;

    # Increase max body size
    client_max_body_size 50M;

    location / {
        proxy_pass http://localhost:3000;

        # Increase timeouts for large uploads
        proxy_read_timeout 300s;
        proxy_connect_timeout 75s;
    }
}
```

**2. Add Missing Database Index**
```sql
-- Before (slow query)
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Seq Scan on users (cost=0.00..1234.56 rows=1 width=123) (actual time=45.123..45.124 rows=1 loops=1)

-- After (add index)
CREATE INDEX idx_users_email ON users(email);

EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'user@example.com';
-- Index Scan using idx_users_email on users (cost=0.29..8.30 rows=1 width=123) (actual time=0.012..0.013 rows=1 loops=1)
```

### 3. Implement the Fix

Execute the implementation with safety measures:

#### Implementation Checklist

**Pre-Implementation**:
- [ ] Create feature branch from main
- [ ] Review related code for similar issues
- [ ] Identify all affected areas
- [ ] Plan rollback strategy
- [ ] Prepare monitoring queries (see the sketch below)
||||
**During Implementation**:
|
||||
```bash
|
||||
# Create feature branch
|
||||
git checkout -b fix/issue-description
|
||||
|
||||
# Make changes incrementally
|
||||
# Test after each change
|
||||
|
||||
# Commit with clear messages
|
||||
git add file1.js
|
||||
git commit -m "fix: add error handling to payment processing"
|
||||
|
||||
git add file2.js
|
||||
git commit -m "fix: add validation for order status"
|
||||
```
|
||||
|
||||
**Code Changes with Safety**:
```javascript
// Add defensive checks
function processOrder(order) {
  // Validate inputs
  if (!order) {
    throw new Error('Order is required');
  }

  if (!order.id) {
    throw new Error('Order must have an id');
  }

  // Log for debugging
  logger.debug('Processing order', { orderId: order.id });

  try {
    // Main logic
    const result = doProcessing(order);

    // Validate output
    if (!result || !result.success) {
      throw new Error('Processing did not return success');
    }

    return result;
  } catch (error) {
    // Enhanced error context
    logger.error('Order processing failed', {
      orderId: order.id,
      error: error.message,
      stack: error.stack
    });

    // Re-throw with context
    throw new ProcessingError(`Failed to process order ${order.id}`, error);
  }
}
```

**Configuration Changes with Rollback**:
```bash
# Backup current config
cp /etc/nginx/nginx.conf /etc/nginx/nginx.conf.backup.$(date +%Y%m%d)

# Make changes
sudo vim /etc/nginx/nginx.conf

# Test configuration before applying
sudo nginx -t

# If test passes, reload
sudo nginx -s reload

# If issues occur, rollback
# sudo cp /etc/nginx/nginx.conf.backup.YYYYMMDD /etc/nginx/nginx.conf
# sudo nginx -s reload
```

**Database Changes with Safety**:
```sql
-- NOTE: CREATE INDEX CONCURRENTLY cannot run inside a transaction block,
-- but it avoids taking a long exclusive lock on the table
CREATE INDEX CONCURRENTLY idx_users_email ON users(email);

-- Verify index was created
\d users

-- Test query with new index
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'test@example.com';

-- If issues, drop and retry
-- DROP INDEX CONCURRENTLY idx_users_email;
```

### 4. Add Safeguards

Implement safeguards to prevent recurrence:

**Safeguard Types**:

**1. Input Validation**
```javascript
// Add schema validation
const Joi = require('joi');

const orderSchema = Joi.object({
  id: Joi.string().uuid().required(),
  userId: Joi.string().uuid().required(),
  amount: Joi.number().positive().required(),
  currency: Joi.string().length(3).required(),
  status: Joi.string().valid('pending', 'processing', 'completed', 'failed').required()
});

function validateOrder(order) {
  const { error, value } = orderSchema.validate(order);
  if (error) {
    throw new ValidationError(`Invalid order: ${error.message}`);
  }
  return value;
}
```

**2. Rate Limiting**
```javascript
const rateLimit = require('express-rate-limit');

// Prevent abuse
const limiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15 minutes
  max: 100, // limit each IP to 100 requests per windowMs
  message: 'Too many requests from this IP'
});

app.use('/api/', limiter);
```

**3. Circuit Breaker**
```javascript
const CircuitBreaker = require('opossum');

// Protect against cascading failures
const breaker = new CircuitBreaker(externalApiCall, {
  timeout: 3000, // 3 seconds
  errorThresholdPercentage: 50,
  resetTimeout: 30000 // 30 seconds
});

breaker.fallback(() => {
  return { cached: true, data: getCachedData() };
});

async function callExternalApi(params) {
  return await breaker.fire(params);
}
```

**4. Retry Logic**
```javascript
const retry = require('async-retry');

async function robustApiCall(params) {
  return await retry(
    async (bail) => {
      try {
        return await apiCall(params);
      } catch (error) {
        // Don't retry client errors
        if (error.statusCode >= 400 && error.statusCode < 500) {
          bail(error);
          return;
        }
        // Retry server errors
        throw error;
      }
    },
    {
      retries: 3,
      minTimeout: 1000,
      maxTimeout: 5000,
      factor: 2
    }
  );
}
```

**5. Graceful Degradation**
```javascript
async function getRecommendations(userId) {
  try {
    // Try ML-based recommendations
    return await mlRecommendationService.getRecommendations(userId);
  } catch (error) {
    logger.warn('ML recommendations failed, falling back to rule-based', error);

    try {
      // Fallback to rule-based
      return await ruleBasedRecommendations(userId);
    } catch (error2) {
      logger.error('All recommendation methods failed', error2);

      // Final fallback to popular items
      return await getPopularItems();
    }
  }
}
```

### 5. Verification

Thoroughly verify the fix works:

**Verification Levels**:

**Level 1: Unit Tests**
```javascript
describe('processPayment', () => {
  it('should handle missing order gracefully', async () => {
    await expect(processPayment('nonexistent-id'))
      .rejects
      .toThrow('Order nonexistent-id not found');
  });

  it('should reject orders not in pending status', async () => {
    const completedOrder = await createTestOrder({ status: 'completed' });

    await expect(processPayment(completedOrder.id))
      .rejects
      .toThrow('is not in pending status');
  });

  it('should process valid pending orders', async () => {
    const order = await createTestOrder({ status: 'pending', amount: 100 });

    const result = await processPayment(order.id);

    expect(result.success).toBe(true);
    expect(result.transactionId).toBeDefined();
  });
});
```

**Level 2: Integration Tests**
```javascript
describe('Payment Integration', () => {
  it('should handle full payment flow', async () => {
    // Create order
    const order = await createOrder({ amount: 100 });
    expect(order.status).toBe('pending');

    // Process payment
    const result = await processPayment(order.id);
    expect(result.success).toBe(true);

    // Verify order updated
    const updatedOrder = await getOrder(order.id);
    expect(updatedOrder.status).toBe('completed');

    // Verify transaction recorded
    const transaction = await getTransaction(result.transactionId);
    expect(transaction.orderId).toBe(order.id);
  });
});
```

**Level 3: Manual Testing**
```bash
# Test the fix manually
npm start

# In another terminal, reproduce the original issue
curl -X POST http://localhost:3000/api/orders/12345/payment

# Verify fix:
# - Check response is successful
# - Check logs for proper error handling
# - Check database state is consistent
```

**Level 4: Load Testing**
```javascript
// Use k6 for load testing
import http from 'k6/http';
import { check, sleep } from 'k6';

export let options = {
  stages: [
    { duration: '2m', target: 100 }, // Ramp up to 100 users
    { duration: '5m', target: 100 }, // Stay at 100 users
    { duration: '2m', target: 0 },   // Ramp down
  ],
};

export default function () {
  let response = http.post('http://localhost:3000/api/orders/payment', {
    orderId: '12345'
  });

  check(response, {
    'status is 200': (r) => r.status === 200,
    'no errors': (r) => !r.json('error')
  });

  sleep(1);
}
```

**Level 5: Production Smoke Test**
```bash
# After deployment, test in production
# Use a feature flag if possible

# Test with low traffic
curl https://api.production.com/health
curl https://api.production.com/api/test-endpoint

# Monitor metrics:
# - Error rate
# - Response time
# - Resource usage

# If issues are detected, roll back immediately
```

### 6. Prevention Measures

Add measures to prevent similar issues:

**Prevention Strategies**:

**1. Add Regression Tests**
```javascript
// This test would have caught the bug
describe('Regression: Order Processing Bug #1234', () => {
  it('should not crash when order is missing', async () => {
    // This used to cause a crash
    await expect(processPayment('missing-order'))
      .rejects
      .toThrow('Order missing-order not found');
    // No crash, proper error thrown
  });
});
```

**2. Add Monitoring**
```javascript
// Add custom metrics
const { Counter, Histogram } = require('prom-client');

const paymentErrors = new Counter({
  name: 'payment_processing_errors_total',
  help: 'Total payment processing errors',
  labelNames: ['error_type']
});

const paymentDuration = new Histogram({
  name: 'payment_processing_duration_seconds',
  help: 'Payment processing duration',
  labelNames: ['status'] // required so the timer can be labelled below
});

async function processPayment(orderId) {
  const end = paymentDuration.startTimer();

  try {
    const result = await _processPayment(orderId);
    end({ status: 'success' });
    return result;
  } catch (error) {
    paymentErrors.inc({ error_type: error.constructor.name });
    end({ status: 'error' });
    throw error;
  }
}
```

**3. Add Alerting**
```yaml
# Prometheus alert rules
groups:
  - name: payment_processing
    rules:
      - alert: HighPaymentErrorRate
        expr: rate(payment_processing_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High payment error rate detected"
          description: "Payment error rate is {{ $value }} errors/sec"
```

**4. Improve Logging**
```javascript
// Add structured logging
logger.info('Processing payment', {
  orderId: order.id,
  amount: order.amount,
  userId: order.userId,
  timestamp: new Date().toISOString()
});

// Log key decision points
logger.debug('Order validation passed', { orderId });
logger.debug('Calling payment gateway', { orderId, amount });
logger.debug('Payment gateway responded', { orderId, success: result.success });
```

**5. Update Documentation**
```markdown
# Common Issues and Solutions

## Issue: Payment Processing Fails Silently

**Symptoms**: Orders stuck in pending status

**Root Cause**: Missing error handling in payment processor

**Solution**: Added comprehensive error handling and logging

**Prevention**:
- All payment operations now have try-catch blocks
- Errors are logged with full context
- Alerts trigger on error rate > 10%

**Related Code**: src/services/payment-processor.js
**Tests**: tests/integration/payment-processing.test.js
**Monitoring**: Grafana dashboard "Payment Processing"
```

## Output Format

```markdown
# Fix Report: [Issue Summary]

## Summary
[Brief description of the fix implemented]

## Root Cause Addressed
[Detailed explanation of what root cause this fix addresses]

## Changes Made

### Code Changes

#### File: [path/to/file1]
**Purpose**: [Why this file was changed]

\`\`\`[language]
// Before
[original code]

// After
[fixed code]

// Why this works
[explanation]
\`\`\`

#### File: [path/to/file2]
**Purpose**: [Why this file was changed]

\`\`\`[language]
[changes with before/after]
\`\`\`

### Configuration Changes

#### File: [config/file]
\`\`\`
[configuration changes]
\`\`\`
**Impact**: [What this configuration change affects]

### Infrastructure Changes

#### Component: [infrastructure component]
\`\`\`
[infrastructure changes]
\`\`\`
**Impact**: [What this infrastructure change affects]

## Safeguards Added

### Input Validation
[Validation added to prevent bad inputs]

### Error Handling
[Error handling added for failure scenarios]

### Rate Limiting
[Rate limiting or throttling added]

### Monitoring
[Monitoring/metrics added]

### Alerting
[Alerts configured]

## Verification Results

### Unit Tests
\`\`\`
[test results]
\`\`\`
**Status**: ✅ All tests passing

### Integration Tests
\`\`\`
[test results]
\`\`\`
**Status**: ✅ All tests passing

### Manual Testing
[Description of manual testing performed]
**Status**: ✅ Issue no longer reproduces

### Load Testing
[Results of load testing]
**Status**: ✅ Performs well under load

## Prevention Measures

### Tests Added
- [Test 1]: Prevents regression
- [Test 2]: Covers edge case

### Monitoring Added
- [Metric 1]: Tracks error rate
- [Metric 2]: Tracks performance

### Alerts Configured
- [Alert 1]: Fires when error rate exceeds threshold
- [Alert 2]: Fires when performance degrades

### Documentation Updated
- [Doc 1]: Troubleshooting guide
- [Doc 2]: Runbook for oncall

## Deployment Plan

### Pre-Deployment
1. [Step 1]
2. [Step 2]

### Deployment
1. [Step 1]
2. [Step 2]

### Post-Deployment
1. [Step 1 - monitoring]
2. [Step 2 - verification]

### Rollback Plan
\`\`\`bash
[commands to rollback if needed]
\`\`\`

## Verification Steps

### How to Verify the Fix
1. [Verification step 1]
2. [Verification step 2]

### Expected Behavior After Fix
[Description of expected behavior]

### Monitoring Queries
\`\`\`
[queries to monitor fix effectiveness]
\`\`\`

## Related Issues

### Similar Issues Fixed
- [Related issue 1]
- [Related issue 2]

### Potential Similar Issues
- [Potential issue 1 to check]
- [Potential issue 2 to check]

## Lessons Learned
[Key insights from implementing this fix]

## Files Modified
- [file1]
- [file2]
- [file3]

## Commits
\`\`\`
[git log output showing fix commits]
\`\`\`
```

## Error Handling

**Fix Fails Verification**:
If the fix doesn't resolve the issue:
1. Re-examine the root cause analysis
2. Check whether multiple issues are present
3. Verify the fix was implemented correctly
4. Add more diagnostic logging

**Fix Causes New Issues**:
If the fix introduces side effects:
1. Roll back immediately
2. Analyze the cause of the side effect
3. Redesign the fix to avoid it
4. Add tests for the side-effect scenario

**Cannot Deploy Fix**:
If deployment is blocked:
1. Implement a workaround if possible
2. Document the deployment blockers
3. Create a deployment plan to address them
4. Consider a feature flag for gradual rollout (see the sketch below)

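A minimal sketch of the feature-flag approach from the last point; the `featureFlags` client and flag name are illustrative stand-ins for whatever flag system is in use:

```javascript
// Hypothetical flag client (e.g. LaunchDarkly, Unleash, or a config lookup)
const featureFlags = require('./feature-flags');

async function processPayment(orderId) {
  // Send a small slice of traffic through the fixed path first
  if (await featureFlags.isEnabled('payment-fix-gradual-rollout')) {
    return processPaymentFixed(orderId); // new, fixed implementation
  }
  return processPaymentLegacy(orderId); // old path, kept for fast rollback
}
```
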
## Integration with Other Operations

- **Before**: Use `/debug diagnose` to identify root cause
- **Before**: Use `/debug reproduce` to create test case
- **After**: Use `/debug performance` if fix affects performance
- **After**: Use `/debug memory` if fix affects memory usage

## Agent Utilization

This operation leverages the **10x-fullstack-engineer** agent for:
- Designing robust fixes that address root causes
- Implementing comprehensive safeguards
- Creating thorough verification strategies
- Considering performance and security implications
- Planning prevention measures
1006
commands/debug/memory.md
Normal file
File diff suppressed because it is too large
965
commands/debug/performance.md
Normal file
@@ -0,0 +1,965 @@

# Performance Operation - Performance Debugging and Profiling

You are executing the **performance** operation to debug performance issues, profile application behavior, and optimize system performance.

## Parameters

**Received**: `$ARGUMENTS` (after removing 'performance' operation name)

Expected format: `component:"component-name" [metric:"response-time|throughput|cpu|memory"] [threshold:"target-value"] [duration:"profile-duration"] [load:"concurrent-users"]`

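For example, a typical invocation might look like this (the component and target values are illustrative):

```
/debug performance component:"checkout-api" metric:"response-time" threshold:"200ms" duration:"60s" load:"100"
```
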
## Workflow

### 1. Establish Performance Baseline

Measure current performance before optimization:

**Baseline Metrics to Capture**:
```bash
# Response time baseline
curl -w "@curl-format.txt" -o /dev/null -s http://localhost:3000/api/endpoint

# Create curl-format.txt
cat > curl-format.txt <<'EOF'
time_namelookup: %{time_namelookup}\n
time_connect: %{time_connect}\n
time_appconnect: %{time_appconnect}\n
time_pretransfer: %{time_pretransfer}\n
time_redirect: %{time_redirect}\n
time_starttransfer: %{time_starttransfer}\n
----------\n
time_total: %{time_total}\n
EOF

# Throughput baseline
ab -n 1000 -c 10 http://localhost:3000/api/endpoint

# Resource usage baseline
# CPU
mpstat 1 60 > baseline_cpu.txt

# Memory
free -m && ps aux --sort=-%mem | head -20

# Disk I/O
iostat -x 1 60 > baseline_io.txt
```

**Application Metrics**:
```javascript
// Add timing middleware
app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    const duration = Date.now() - start;
    console.log({
      method: req.method,
      path: req.path,
      status: res.statusCode,
      duration: duration,
      timestamp: new Date().toISOString()
    });
  });

  next();
});

// Track key operations
const startTime = Date.now();
await operation();
const duration = Date.now() - startTime;
metrics.histogram('operation_duration', duration);
```

### 2. Identify Performance Bottlenecks

Use profiling to find slow components:

#### Application Profiling

**Node.js Profiling**:
```bash
# CPU profiling
node --prof app.js
# Run load test
ab -n 10000 -c 100 http://localhost:3000/
# Stop app, process profile
node --prof-process isolate-*-v8.log > processed.txt

# Chrome DevTools profiling
node --inspect app.js
# Open chrome://inspect
# Click "Open dedicated DevTools for Node"
# Go to Profiler tab, start profiling

# Clinic.js for comprehensive profiling
npm install -g clinic
clinic doctor -- node app.js
# Run load test
clinic doctor --visualize-only PID.clinic-doctor
```

**Python Profiling**:
```python
import cProfile
import pstats

# Profile a function
cProfile.run('my_function()', 'profile_stats')

# Analyze results
p = pstats.Stats('profile_stats')
p.sort_stats('cumulative')
p.print_stats(20)

# Line profiler for detailed profiling
from line_profiler import LineProfiler

profiler = LineProfiler()
profiler.add_function(my_function)
profiler.run('my_function()')
profiler.print_stats()

# Memory profiling
from memory_profiler import profile

@profile
def my_function():
    large_list = [i for i in range(1000000)]
    return sum(large_list)
```

**Use the profiling utility script**:
```bash
# Run comprehensive profiling
./commands/debug/.scripts/profile.sh \
  --app node_app \
  --duration 60 \
  --endpoint http://localhost:3000/api/slow

# Output: CPU profile, memory profile, flamegraph
```

#### Database Profiling

**Query Performance**:
```sql
-- PostgreSQL: Enable query timing
\timing on

-- Analyze query plan
EXPLAIN ANALYZE
SELECT u.*, o.*
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2024-01-01';

-- Look for:
-- - Seq Scan (sequential scan - bad for large tables)
-- - High cost estimates
-- - Large number of rows processed
-- - Missing indexes

-- Check slow queries
SELECT
  query,
  calls,
  total_time,
  mean_time,
  max_time
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 20;

-- Find tables with heavy sequential scans (candidates for new indexes)
SELECT
  schemaname,
  tablename,
  seq_scan,
  seq_tup_read,
  idx_scan,
  seq_tup_read / seq_scan AS avg_seq_read
FROM pg_stat_user_tables
WHERE seq_scan > 0
ORDER BY seq_tup_read DESC
LIMIT 20;
```

**Connection Pool Analysis**:
```javascript
// Monitor connection pool
pool.on('acquire', (client) => {
  console.log('Client acquired:', {
    poolSize: pool.totalCount,
    idleCount: pool.idleCount,
    waitingCount: pool.waitingCount
  });
});

pool.on('remove', (client) => {
  console.log('Client removed from pool');
});

// Check pool stats periodically
setInterval(() => {
  console.log('Pool stats:', {
    total: pool.totalCount,
    idle: pool.idleCount,
    waiting: pool.waitingCount
  });
}, 10000);
```

#### Network Profiling

**API Call Analysis**:
```bash
# Trace system calls made by the process
strace -c -p PID

# Detailed network timing
tcpdump -i any -w capture.pcap port 3000
# Analyze with Wireshark

# HTTP request tracing
curl -w "@curl-format.txt" -v http://localhost:3000/api/endpoint

# Check DNS resolution
time nslookup api.example.com

# Check network latency
ping -c 10 api.example.com
```

**Browser Performance**:
```javascript
// Use the Performance API
performance.mark('start-operation');
await operation();
performance.mark('end-operation');
performance.measure('operation', 'start-operation', 'end-operation');

const measure = performance.getEntriesByName('operation')[0];
console.log('Operation took:', measure.duration, 'ms');

// Navigation timing
const perfData = performance.getEntriesByType('navigation')[0];
console.log({
  dns: perfData.domainLookupEnd - perfData.domainLookupStart,
  tcp: perfData.connectEnd - perfData.connectStart,
  ttfb: perfData.responseStart - perfData.requestStart,
  download: perfData.responseEnd - perfData.responseStart,
  domReady: perfData.domContentLoadedEventEnd - perfData.domContentLoadedEventStart,
  load: perfData.loadEventEnd - perfData.loadEventStart
});

// Resource timing
performance.getEntriesByType('resource').forEach(resource => {
  console.log(resource.name, resource.duration);
});
```

### 3. Analyze Bottlenecks

Understand why components are slow:

#### CPU Bottlenecks

**Identify CPU-intensive operations**:
```javascript
// Find CPU-heavy code
const { performance } = require('perf_hooks');

function analyzePerformance() {
  const start = performance.now();

  // Suspect operation
  const result = expensiveOperation();

  const duration = performance.now() - start;
  if (duration > 100) { // More than 100ms
    console.warn('CPU-intensive operation detected:', {
      operation: 'expensiveOperation',
      duration: duration
    });
  }

  return result;
}
```

**Common CPU bottlenecks**:
- Complex regex operations
- Large array/object operations
- JSON parsing/stringifying large objects
- Synchronous file operations
- Cryptographic operations
- Image processing

**Solutions**:
```javascript
// Before: Synchronous blocking
const data = JSON.parse(largeJsonString);

// After: Async in a worker thread
const { Worker } = require('worker_threads');

function parseJsonAsync(jsonString) {
  return new Promise((resolve, reject) => {
    const worker = new Worker(`
      const { parentPort } = require('worker_threads');
      parentPort.on('message', (data) => {
        const parsed = JSON.parse(data);
        parentPort.postMessage(parsed);
      });
    `, { eval: true });

    worker.on('message', resolve);
    worker.on('error', reject);
    worker.postMessage(jsonString);
  });
}
```

#### I/O Bottlenecks

**Identify I/O-bound operations**:
```javascript
// Monitor I/O operations
const fs = require('fs').promises;

async function monitoredFileRead(path) {
  const start = Date.now();
  try {
    const data = await fs.readFile(path);
    const duration = Date.now() - start;

    console.log('File read:', { path, duration, size: data.length });

    if (duration > 50) {
      console.warn('Slow file read detected:', path);
    }

    return data;
  } catch (error) {
    console.error('File read failed:', { path, error });
    throw error;
  }
}
```

**Common I/O bottlenecks**:
- Multiple database queries in sequence (N+1 problem)
- Synchronous file operations
- External API calls in sequence
- Large file uploads/downloads

**Solutions**:
```javascript
// Before: Sequential queries (N+1)
const users = await User.findAll();
for (const user of users) {
  user.posts = await Post.findByUserId(user.id); // N queries
}

// After: Single query with join
const users = await User.findAll({
  include: [{ model: Post }]
});

// Before: Sequential API calls
const user = await fetchUser(userId);
const orders = await fetchOrders(userId);
const profile = await fetchProfile(userId);

// After: Parallel execution
const [user, orders, profile] = await Promise.all([
  fetchUser(userId),
  fetchOrders(userId),
  fetchProfile(userId)
]);
```

#### Memory Bottlenecks

**Identify memory issues**:
```javascript
// Monitor memory usage
function logMemoryUsage(label) {
  const usage = process.memoryUsage();
  console.log(`[${label}] Memory:`, {
    rss: Math.round(usage.rss / 1024 / 1024) + 'MB',
    heapTotal: Math.round(usage.heapTotal / 1024 / 1024) + 'MB',
    heapUsed: Math.round(usage.heapUsed / 1024 / 1024) + 'MB',
    external: Math.round(usage.external / 1024 / 1024) + 'MB'
  });
}

logMemoryUsage('before-operation');
await operation();
logMemoryUsage('after-operation');
```

**Common memory bottlenecks**:
- Loading large datasets into memory
- Caching without size limits
- Memory leaks (event listeners, closures)
- Large object allocations

**Solutions**:
```javascript
// Before: Load entire file into memory
const data = await fs.readFile('large-file.csv', 'utf8');
const lines = data.split('\n');

// After: Stream processing
const readline = require('readline');
const stream = fs.createReadStream('large-file.csv');
const rl = readline.createInterface({ input: stream });

for await (const line of rl) {
  processLine(line); // Process one line at a time
}

// Before: Unbounded cache
const cache = {};
cache[key] = value; // Grows forever

// After: LRU cache with size limit
const LRU = require('lru-cache');
const cache = new LRU({
  max: 1000, // Max items
  maxSize: 50 * 1024 * 1024, // 50MB
  sizeCalculation: (value) => JSON.stringify(value).length
});
```

### 4. Implement Optimizations

Apply targeted optimizations:

#### Query Optimization

**Add Indexes**:
```sql
-- Before: Slow query
EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123;
-- Seq Scan on orders (cost=0.00..1234.56 rows=10 width=100) (actual time=45.123..45.456 rows=10 loops=1)

-- After: Add index
CREATE INDEX idx_orders_user_id ON orders(user_id);

EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123;
-- Index Scan using idx_orders_user_id on orders (cost=0.29..8.30 rows=10 width=100) (actual time=0.012..0.015 rows=10 loops=1)
```

**Optimize Queries**:
```sql
-- Before: Inefficient
SELECT * FROM orders o
LEFT JOIN users u ON o.user_id = u.id
WHERE o.created_at > NOW() - INTERVAL '7 days';

-- After: Select only needed columns, add index
CREATE INDEX idx_orders_created_at ON orders(created_at);

SELECT o.id, o.amount, u.name
FROM orders o
INNER JOIN users u ON o.user_id = u.id
WHERE o.created_at > NOW() - INTERVAL '7 days';
```

#### Caching

**Application-level caching**:
```javascript
const cache = new Map();

async function getCachedData(key) {
  // Check cache first
  if (cache.has(key)) {
    console.log('Cache hit:', key);
    return cache.get(key);
  }

  // Cache miss - fetch from database
  console.log('Cache miss:', key);
  const data = await fetchFromDatabase(key);

  // Store in cache
  cache.set(key, data);

  // Expire after 5 minutes
  setTimeout(() => cache.delete(key), 5 * 60 * 1000);

  return data;
}

// Redis caching
const redis = require('redis');
const client = redis.createClient();

async function getCachedDataRedis(key) {
  // Try cache
  const cached = await client.get(key);
  if (cached) {
    return JSON.parse(cached);
  }

  // Fetch and cache
  const data = await fetchFromDatabase(key);
  await client.setEx(key, 300, JSON.stringify(data)); // 5 min TTL

  return data;
}
```

#### Code Optimization

**Optimize algorithms**:
```javascript
// Before: O(n²) - slow for large arrays
function findDuplicates(arr) {
  const duplicates = [];
  for (let i = 0; i < arr.length; i++) {
    for (let j = i + 1; j < arr.length; j++) {
      if (arr[i] === arr[j]) {
        duplicates.push(arr[i]);
      }
    }
  }
  return duplicates;
}

// After: O(n) - much faster
function findDuplicates(arr) {
  const seen = new Set();
  const duplicates = new Set();

  for (const item of arr) {
    if (seen.has(item)) {
      duplicates.add(item);
    } else {
      seen.add(item);
    }
  }

  return Array.from(duplicates);
}
```

**Lazy loading**:
```javascript
// Before: Load all data upfront
const allUsers = await User.findAll();
const allPosts = await Post.findAll();

// After: Load on demand
async function getUserWithPosts(userId, needsPosts = false) {
  const user = await User.findById(userId);
  // Only load posts when needed
  if (needsPosts) {
    user.posts = await Post.findByUserId(userId);
  }
  return user;
}
```

**Pagination**:
```javascript
// Before: Load all results
const results = await db.query('SELECT * FROM large_table');

// After: Paginate
const page = 1;
const pageSize = 100;
const results = await db.query(
  'SELECT * FROM large_table LIMIT $1 OFFSET $2',
  [pageSize, (page - 1) * pageSize]
);
```

#### Async Optimization

**Parallel execution**:
```javascript
// Before: Sequential (slow)
const user = await fetchUser();
const orders = await fetchOrders();
const payments = await fetchPayments();
// Total time: time(user) + time(orders) + time(payments)

// After: Parallel (fast)
const [user, orders, payments] = await Promise.all([
  fetchUser(),
  fetchOrders(),
  fetchPayments()
]);
// Total time: max(time(user), time(orders), time(payments))
```

**Batch processing**:
```javascript
// Before: Process one at a time
for (const item of items) {
  await processItem(item); // Slow for many items
}

// After: Process in batches
const batchSize = 10;
for (let i = 0; i < items.length; i += batchSize) {
  const batch = items.slice(i, i + batchSize);
  await Promise.all(batch.map(item => processItem(item)));
}
```

### 5. Load Testing

Verify optimizations under load:

**Load Testing Tools**:

**Apache Bench**:
```bash
# Simple load test
ab -n 10000 -c 100 http://localhost:3000/api/endpoint

# With keep-alive
ab -n 10000 -c 100 -k http://localhost:3000/api/endpoint

# POST with data
ab -n 1000 -c 10 -p data.json -T application/json http://localhost:3000/api/endpoint
```

**k6 (recommended)**:
```javascript
// load-test.js
import http from 'k6/http';
import { check, sleep } from 'k6';

export let options = {
  stages: [
    { duration: '2m', target: 100 }, // Ramp up to 100 users
    { duration: '5m', target: 100 }, // Stay at 100 users
    { duration: '2m', target: 200 }, // Ramp up to 200 users
    { duration: '5m', target: 200 }, // Stay at 200 users
    { duration: '2m', target: 0 },   // Ramp down to 0
  ],
  thresholds: {
    http_req_duration: ['p(95)<500'], // 95% of requests < 500ms
    http_req_failed: ['rate<0.01'],   // Error rate < 1%
  },
};

export default function () {
  const response = http.get('http://localhost:3000/api/endpoint');

  check(response, {
    'status is 200': (r) => r.status === 200,
    'response time < 500ms': (r) => r.timings.duration < 500,
  });

  sleep(1);
}
```

```bash
# Run load test
k6 run load-test.js

# With real-time monitoring
k6 run --out influxdb=http://localhost:8086/k6 load-test.js
```

**Artillery**:
```yaml
# load-test.yml
config:
  target: 'http://localhost:3000'
  phases:
    - duration: 120
      arrivalRate: 10
      name: "Warm up"
    - duration: 300
      arrivalRate: 50
      name: "Sustained load"
    - duration: 120
      arrivalRate: 100
      name: "Peak load"

scenarios:
  - name: "API endpoints"
    flow:
      - get:
          url: "/api/users"
      - get:
          url: "/api/orders"
      - post:
          url: "/api/orders"
          json:
            userId: 123
            amount: 100
```

```bash
# Run test
artillery run load-test.yml

# With report
artillery run --output report.json load-test.yml
artillery report report.json
```

### 6. Monitor Performance Improvements

Compare before and after:

**Metrics to Compare**:
```markdown
## Before Optimization
- Response time P50: 200ms
- Response time P95: 800ms
- Response time P99: 2000ms
- Throughput: 100 req/s
- Error rate: 2%
- CPU usage: 80%
- Memory usage: 1.5GB

## After Optimization
- Response time P50: 50ms ✅ 75% improvement
- Response time P95: 200ms ✅ 75% improvement
- Response time P99: 500ms ✅ 75% improvement
- Throughput: 400 req/s ✅ 4x improvement
- Error rate: 0.1% ✅ 20x improvement
- CPU usage: 40% ✅ 50% reduction
- Memory usage: 800MB ✅ 47% reduction
```

**Monitoring Dashboard**:
```javascript
// Expose metrics for Prometheus
const promClient = require('prom-client');

// Response time histogram
const httpDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});

// Throughput counter
const httpRequests = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// Middleware to track metrics
app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;

    httpDuration.observe(
      { method: req.method, route: req.route?.path || req.path, status_code: res.statusCode },
      duration
    );

    httpRequests.inc({
      method: req.method,
      route: req.route?.path || req.path,
      status_code: res.statusCode
    });
  });

  next();
});

// Metrics endpoint
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', promClient.register.contentType);
  res.end(await promClient.register.metrics());
});
```

## Output Format

```markdown
# Performance Optimization Report: [Component Name]

## Summary
[Brief summary of optimization results]

## Performance Baseline

### Before Optimization
- **Response Time P50**: [value]ms
- **Response Time P95**: [value]ms
- **Response Time P99**: [value]ms
- **Throughput**: [value] req/s
- **Error Rate**: [value]%
- **CPU Usage**: [value]%
- **Memory Usage**: [value]MB

## Bottlenecks Identified

### Bottleneck 1: [Name]
- **Type**: [CPU|I/O|Memory|Network]
- **Location**: [file:line or component]
- **Impact**: [% of total time or resource usage]
- **Evidence**:
\`\`\`
[profiling data or logs showing bottleneck]
\`\`\`

### Bottleneck 2: [Name]
[similar structure]

## Optimizations Implemented

### Optimization 1: [Name]
**Problem**: [what was slow]

**Solution**: [what was done]

**Code Changes**:
\`\`\`[language]
// Before
[original slow code]

// After
[optimized code]
\`\`\`

**Impact**:
- Response time: [before]ms → [after]ms ([%] improvement)
- Resource usage: [before] → [after] ([%] improvement)

### Optimization 2: [Name]
[similar structure]

## Performance After Optimization

### After Optimization
- **Response Time P50**: [value]ms ✅ [%] improvement
- **Response Time P95**: [value]ms ✅ [%] improvement
- **Response Time P99**: [value]ms ✅ [%] improvement
- **Throughput**: [value] req/s ✅ [x]x improvement
- **Error Rate**: [value]% ✅ [%] improvement
- **CPU Usage**: [value]% ✅ [%] reduction
- **Memory Usage**: [value]MB ✅ [%] reduction

## Load Testing Results

### Test Configuration
- **Tool**: [k6|artillery|ab]
- **Duration**: [duration]
- **Peak Load**: [number] concurrent users
- **Total Requests**: [number]

### Results
\`\`\`
[load test output]
\`\`\`

### Performance Under Load
[Description of how system performed under sustained load]

## Profiling Data

### CPU Profile
[Flame graph or top CPU-consuming functions]

### Memory Profile
[Heap snapshots or memory allocation patterns]

### Query Performance
[Database query analysis results]

## Monitoring Setup

### Metrics Added
- [Metric 1]: Tracks [what]
- [Metric 2]: Tracks [what]

### Dashboards Created
- [Dashboard 1]: [URL and description]
- [Dashboard 2]: [URL and description]

### Alerts Configured
- [Alert 1]: Triggers when [condition]
- [Alert 2]: Triggers when [condition]

## Recommendations

### Additional Optimizations
1. [Optimization 1]: [Expected impact]
2. [Optimization 2]: [Expected impact]

### Monitoring
1. [What to monitor]
2. [What thresholds to set]

### Future Improvements
1. [Long-term improvement 1]
2. [Long-term improvement 2]

## Files Modified
- [file1]: [what was changed]
- [file2]: [what was changed]

## Verification Steps

### How to Verify
1. [Step 1]
2. [Step 2]

### Expected Behavior
[What should be observed]

## Next Steps
1. [Next step 1]
2. [Next step 2]
```

## Error Handling

**Optimization Degrades Performance**:
If an optimization makes things slower:
1. Rollback immediately
2. Re-profile to understand why
3. Check for introduced overhead
4. Verify test methodology

**Cannot Reproduce Performance Issue**:
If the issue only occurs in production:
1. Compare production vs test environment (see the sketch below)
2. Check production load patterns
3. Analyze production metrics
4. Consider production data characteristics

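One hedged way to capture the prod-vs-test comparison from step 1; the host names and the variable filter are placeholders:

```bash
# Capture runtime and config fingerprints in each environment, then diff them
for env in prod test; do
  ssh "$env-host" 'node -v; npm ls --depth=0; printenv | sort | grep -v SECRET' > "env-$env.txt"
done
diff env-prod.txt env-test.txt
```
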
**Optimization Introduces Bugs**:
If an optimization causes errors:
1. Rollback the optimization
2. Add comprehensive tests
3. Implement the optimization incrementally
4. Verify correctness at each step

## Integration with Other Operations

- **Before**: Use `/debug diagnose` to identify performance issues
- **Before**: Use `/debug analyze-logs` to understand performance patterns
- **After**: Use `/debug fix` to implement optimizations
- **Related**: Use `/debug memory` for memory-specific optimization

## Agent Utilization

This operation leverages the **10x-fullstack-engineer** agent for:
- Identifying performance bottlenecks across the stack
- Suggesting appropriate optimization strategies
- Implementing code optimizations
- Designing comprehensive load tests
- Interpreting profiling data
695
commands/debug/reproduce.md
Normal file
@@ -0,0 +1,695 @@

# Reproduce Operation - Issue Reproduction Strategies

You are executing the **reproduce** operation to create reliable reproduction strategies and test cases for debugging issues.

## Parameters

**Received**: `$ARGUMENTS` (after removing 'reproduce' operation name)

Expected format: `issue:"problem description" [environment:"prod|staging|dev"] [data:"test-data-location"] [steps:"reproduction-steps"] [reliability:"percentage"]`

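For example (the values are illustrative):

```
/debug reproduce issue:"413 on uploads over 10MB" environment:"prod" reliability:"100%"
```
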
## Workflow

### 1. Understand Reproduction Requirements

Gather information about the issue's behavior:

**Key Questions**:
- How often does the issue occur? (100%, 50%, 5%, etc.)
- Under what conditions? (specific data, timing, load, etc.)
- In which environments? (prod only, all environments)
- What is the expected vs actual behavior?
- Are there known workarounds?

**Reproduction Challenges to Identify**:
- **Timing-dependent** (race conditions, timeouts)
- **Data-dependent** (specific user data, edge cases)
- **Environment-dependent** (prod-only config, specific infrastructure)
- **Load-dependent** (only under high load or concurrency)
- **State-dependent** (requires specific sequence of actions)

Capture the answers in a short intake note, as sketched below.

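A minimal intake template for those answers; the values shown are illustrative, borrowed from the upload example later in this guide:

```markdown
# Reproduction Intake: Upload returns 413

- Frequency: 100%
- Trigger conditions: file uploads > 10MB
- Environments affected: all
- Expected vs actual: upload succeeds vs "Request Entity Too Large"
- Known workarounds: split files under 10MB
- Challenge type: environment-dependent (infrastructure limit)
```
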
### 2. Gather Reproduction Context

Collect all information needed to reproduce:

#### Environment Context

**Application State**:
```bash
# Get application version
git log -1 --oneline
npm list    # Node dependencies
pip freeze  # Python dependencies

# Get configuration
cat .env.production
echo $ENVIRONMENT_VARS

# Get deployed version in production
kubectl get deployment app-name -o jsonpath='{.spec.template.spec.containers[0].image}'
```

**Infrastructure State**:
```bash
# System resources
free -m
df -h
ulimit -a

# Network configuration
ip addr show
cat /etc/resolv.conf

# Service status
systemctl status application-service
docker ps
kubectl get pods
```

#### Data Context

**Database State**:
```sql
-- Get relevant data schema
\d+ table_name

-- Get sample data that triggers issue
SELECT * FROM users WHERE id = 'problematic-user-id';

-- Get data statistics
SELECT count(*), min(created_at), max(created_at) FROM table_name;

-- Export test data
COPY (SELECT * FROM users WHERE id IN ('user1', 'user2')) TO '/tmp/test_data.csv' CSV HEADER;
```

**Request/Response Data**:
```bash
# Capture failing request
# Use browser DevTools > Network > Copy as cURL

curl 'https://api.example.com/endpoint' \
  -H 'Authorization: Bearer TOKEN' \
  -H 'Content-Type: application/json' \
  --data-raw '{"key":"value"}' \
  -v # Verbose output

# Capture webhook payload
# Check logs for incoming webhook data
grep "webhook_payload" logs/application.log | jq .
```

#### User Context

**User Session**:
```javascript
// Browser state
console.log('LocalStorage:', localStorage);
console.log('SessionStorage:', sessionStorage);
console.log('Cookies:', document.cookie);
console.log('User Agent:', navigator.userAgent);

// Authentication state
console.log('Auth Token:', authToken);
console.log('Token Payload:', jwt.decode(authToken));
console.log('Session ID:', sessionId);
```

**User Actions**:
```markdown
1. User logs in as user@example.com
2. Navigates to /dashboard
3. Clicks "Upload File" button
4. Selects file > 10MB
5. Clicks "Submit"
6. Error occurs: "Request Entity Too Large"
```

### 3. Create Local Reproduction

Develop a strategy to reproduce the issue locally:

#### Strategy 1: Direct Reproduction

**For Simple Issues**:
```javascript
const assert = require('assert');

// Create minimal test case
async function reproduceBug() {
  // Setup
  const testData = {
    userId: 'test-user',
    file: createLargeFile(15 * 1024 * 1024) // 15MB
  };

  // Execute problematic operation
  const result = await uploadFile(testData);

  // Verify issue occurs
  assert(result.status === 413, 'Expected 413 error');
}
```

#### Strategy 2: Environment Simulation

**For Environment-Specific Issues**:
```bash
# Replicate production configuration locally
cp .env.production .env.local
sed -i 's/prod-database/localhost:5432/g' .env.local

# Use production data dump
psql local_db < production_data_dump.sql

# Run with production-like settings
NODE_ENV=production npm start
```

#### Strategy 3: Data-Driven Reproduction

**For Data-Specific Issues**:
```javascript
// Load production data that triggers issue
const testData = require('./test-data/problematic-user-data.json');

// Seed database with specific data
await db.users.insert(testData.user);
await db.orders.insertMany(testData.orders);

// Execute operation
const result = await processOrder(testData.orders[0].id);
```

#### Strategy 4: Timing-Based Reproduction

**For Race Conditions**:
```javascript
// Add delays to expose race condition
async function reproduceRaceCondition() {
  // Start two operations simultaneously
  const [result1, result2] = await Promise.all([
    operation1(),
    operation2()
  ]);

  // Or use setTimeout to control timing
  setTimeout(() => operation1(), 0);
  setTimeout(() => operation2(), 1); // 1ms delay
}

// Add intentional delays to expose timing issues
async function operation() {
  await fetchData();
  await sleep(100); // Artificial delay
  await processData(); // May fail if timing-dependent
}
```

#### Strategy 5: Load-Based Reproduction

**For Performance/Concurrency Issues**:
```javascript
// Simulate concurrent requests
async function reproduceUnderLoad() {
  const concurrentRequests = 100;
  const requests = Array(concurrentRequests)
    .fill(null)
    .map(() => makeRequest());

  const results = await Promise.allSettled(requests);
  const failures = results.filter(r => r.status === 'rejected');

  console.log(`Failure rate: ${failures.length}/${concurrentRequests}`);
}
```

```bash
# Use load testing tools
ab -n 1000 -c 100 http://localhost:3000/api/endpoint

# Use k6 for more complex scenarios
k6 run load-test.js

# Monitor during load test
watch -n 1 'ps aux | grep node'
```

### 4. Verify Reproduction Reliability

Test that the reproduction is reliable:

**Reliability Testing**:
```javascript
async function testReproductionReliability() {
  const iterations = 50;
  let reproduced = 0;

  for (let i = 0; i < iterations; i++) {
    try {
      await reproduceIssue();
      reproduced++; // Issue reproduced
    } catch (error) {
      // Issue did not reproduce
    }
  }

  const reliability = (reproduced / iterations) * 100;
  console.log(`Reproduction reliability: ${reliability}%`);

  if (reliability < 80) {
    console.warn('Reproduction is not reliable enough. Need to refine.');
  }
}
```

**Improve Reliability**:
```javascript
// If reliability is low, add more constraints
async function improvedReproduction() {
  // 1. Reset state between attempts
  await resetDatabase();
  await clearCache();

  // 2. Add specific data constraints
  const testUser = await createUserWithSpecificProfile({
    accountAge: 30, // days
    orderCount: 5,
    subscriptionTier: 'premium'
  });

  // 3. Control timing precisely
  await sleep(100); // Ensure service is ready

  // 4. Set specific environment conditions
  process.env.FEATURE_FLAG_X = 'true';

  // Execute
  await reproduceIssue();
}
```

### 5. Create Automated Test Case

Convert the reproduction into an automated test:

**Unit Test Example**:
```javascript
describe('File Upload Bug', () => {
  beforeEach(async () => {
    // Setup test environment
    await resetTestDatabase();
    await clearUploadDirectory();
  });

  it('should handle files larger than 10MB', async () => {
    // Arrange
    const largeFile = createTestFile(15 * 1024 * 1024);
    const user = await createTestUser();

    // Act
    const response = await uploadFile(user.id, largeFile);

    // Assert
    expect(response.status).toBe(413);
    expect(response.body.error).toContain('File too large');
  });

  it('should succeed with files under 10MB', async () => {
    // Verify issue is specifically about size
    const smallFile = createTestFile(5 * 1024 * 1024);
    const user = await createTestUser();

    const response = await uploadFile(user.id, smallFile);

    expect(response.status).toBe(200);
  });
});
```

**Integration Test Example**:
```javascript
describe('Order Processing Race Condition', () => {
  it('should handle concurrent order updates safely', async () => {
    // Setup
    const order = await createTestOrder({ status: 'pending' });

    // Simulate race condition
    const updatePromises = [
      updateOrderStatus(order.id, 'processing'),
      updateOrderStatus(order.id, 'confirmed')
    ];

    // Both should complete without error
    await Promise.all(updatePromises);

    // Verify final state is consistent
    const finalOrder = await getOrder(order.id);
    expect(['processing', 'confirmed']).toContain(finalOrder.status);

    // Verify no data corruption
    const auditLogs = await getOrderAuditLogs(order.id);
    expect(auditLogs).toHaveLength(2);
  });
});
```

**E2E Test Example**:
|
||||
```javascript
|
||||
describe('Dashboard Load Performance', () => {
|
||||
it('should load dashboard under 2 seconds', async () => {
|
||||
// Setup user with large dataset
|
||||
const user = await createUserWithLargeDataset({
|
||||
orders: 1000,
|
||||
documents: 500
|
||||
});
|
||||
|
||||
// Login
|
||||
await page.goto('/login');
|
||||
await page.fill('#email', user.email);
|
||||
await page.fill('#password', 'testpass123');
|
||||
await page.click('#login-button');
|
||||
|
||||
// Navigate to dashboard and measure time
|
||||
const startTime = Date.now();
|
||||
await page.goto('/dashboard');
|
||||
await page.waitForSelector('.dashboard-loaded');
|
||||
const loadTime = Date.now() - startTime;
|
||||
|
||||
// Assert performance
|
||||
expect(loadTime).toBeLessThan(2000);
|
||||
});
|
||||
});
|
||||
```
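
This example assumes a Playwright-style `page` object (`page.fill`, `page.waitForSelector`) and a `.dashboard-loaded` marker that the app renders once data has finished loading; adapt both to your E2E framework. Measuring around `waitForSelector` keeps the timing focused on perceived readiness rather than raw navigation time.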

### 6. Document Reproduction Steps

Create comprehensive reproduction documentation:

**Reproduction Guide Template**:
```markdown
# Reproduction Guide: [Issue Name]

## Prerequisites
- Node.js v18.x
- PostgreSQL 14+
- Docker (optional)
- Test account credentials

## Environment Setup

### 1. Clone and Install
\`\`\`bash
git clone https://github.com/org/repo.git
cd repo
npm install
\`\`\`

### 2. Database Setup
\`\`\`bash
# Create test database
createdb test_app

# Load test data
psql test_app < test-data/problematic_data.sql
\`\`\`

### 3. Configuration
\`\`\`bash
# Copy test environment file
cp .env.test .env

# Update with test database URL
echo "DATABASE_URL=postgresql://localhost/test_app" >> .env
\`\`\`

## Reproduction Steps

### Manual Reproduction
1. Start the application:
   \`\`\`bash
   npm start
   \`\`\`

2. Login with the test user:
   - Email: test@example.com
   - Password: testpass123

3. Navigate to the Dashboard: http://localhost:3000/dashboard

4. Click the "Upload File" button

5. Select a file larger than 10MB from test-data/

6. Click "Submit"

7. **Expected**: File uploads successfully
   **Actual**: 413 Request Entity Too Large error

### Automated Reproduction
\`\`\`bash
# Run reproduction test
npm test -- tests/reproduction/file-upload-bug.test.js

# Expected output:
# ✓ reproduces 413 error with files > 10MB
# ✓ succeeds with files < 10MB
\`\`\`

## Reproduction Reliability
- **Success Rate**: 100% (fails every time)
- **Environment**: All environments
- **Conditions**: File size > 10MB

## Key Observations
- Issue occurs consistently with files > 10MB
- Works fine with files ≤ 10MB
- Error comes from Nginx, not the application
- Content-Length header shows the correct size

## Debugging Hints
- Check the Nginx configuration: `/etc/nginx/nginx.conf`
- Look for the `client_max_body_size` directive
- Application code may be fine; this points to an infrastructure issue

## Related Files
- test-data/large-file.bin (15MB test file)
- test-data/problematic_data.sql (test database dump)
- tests/reproduction/file-upload-bug.test.js (automated test)
```

### 7. Validate Different Scenarios

Test edge cases and variations:

**Scenario Matrix**:
```javascript
const testScenarios = [
  // Vary file sizes
  { fileSize: '1MB', expected: 'success' },
  { fileSize: '10MB', expected: 'success' },
  { fileSize: '11MB', expected: 'failure' },
  { fileSize: '50MB', expected: 'failure' },

  // Vary file types
  { fileType: 'image/jpeg', expected: 'success' },
  { fileType: 'application/pdf', expected: 'success' },
  { fileType: 'video/mp4', expected: 'failure' },

  // Vary user types
  { userType: 'free', expected: 'failure' },
  { userType: 'premium', expected: 'success' },

  // Vary environments
  { environment: 'local', expected: 'success' },
  { environment: 'staging', expected: 'failure' },
  { environment: 'production', expected: 'failure' }
];

for (const scenario of testScenarios) {
  const result = await runScenario(scenario);
  console.log(`Scenario ${JSON.stringify(scenario)}: ${result}`);
}
```
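
The loop above calls a `runScenario` helper that is not shown. A hypothetical sketch of its shape, reusing the upload helpers assumed in the earlier test examples:

```javascript
// Hypothetical helper: execute one scenario and compare the outcome to the
// expectation. createTestFile, createTestUser, and uploadFile are the same
// assumed helpers as in the test examples above.
async function runScenario({ fileSize, fileType, userType, environment, expected }) {
  // environment selection (local/staging/production) would pick a base URL;
  // omitted here for brevity.
  const sizeBytes = fileSize ? parseInt(fileSize, 10) * 1024 * 1024 : 1024;
  const file = createTestFile(sizeBytes, fileType ?? 'application/octet-stream');
  const user = await createTestUser({ tier: userType ?? 'free' });

  const response = await uploadFile(user.id, file);
  const outcome = response.status < 400 ? 'success' : 'failure';
  return outcome === expected
    ? `ok (${outcome})`
    : `MISMATCH: got ${outcome}, expected ${expected}`;
}
```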

## Output Format

```markdown
# Reproduction Report: [Issue Name]

## Summary
[Brief description of reproduction strategy and success]

## Reproduction Reliability
- **Success Rate**: [percentage]%
- **Environment**: [local|staging|production|all]
- **Conditions**: [specific conditions needed]
- **Timing**: [immediate|delayed|intermittent]

## Prerequisites

### Environment Requirements
- [Software requirement 1]
- [Software requirement 2]
- [Configuration requirement 1]

### Data Requirements
- [Test data 1]
- [Test data 2]
- [Database state]

### Access Requirements
- [Credentials needed]
- [Permissions needed]
- [Resources needed]

## Reproduction Steps

### Quick Reproduction
\`\`\`bash
# Fastest way to reproduce
[commands to quickly reproduce the issue]
\`\`\`

### Detailed Reproduction

#### Step 1: [Setup]
\`\`\`bash
[detailed commands]
\`\`\`
[Expected result]

#### Step 2: [Preparation]
\`\`\`bash
[detailed commands]
\`\`\`
[Expected result]

#### Step 3: [Trigger Issue]
\`\`\`bash
[detailed commands]
\`\`\`
**Expected**: [expected behavior]
**Actual**: [actual behavior with issue]

## Automated Test Case

### Test Code
\`\`\`[language]
[Complete automated test that reproduces the issue]
\`\`\`

### Running the Test
\`\`\`bash
[command to run the test]
\`\`\`

### Expected Output
\`\`\`
[what the test output should show]
\`\`\`

## Scenario Variations

### Variation 1: [Description]
- **Conditions**: [conditions]
- **Result**: [occurs|does not occur]
- **Notes**: [observations]

### Variation 2: [Description]
- **Conditions**: [conditions]
- **Result**: [occurs|does not occur]
- **Notes**: [observations]

## Key Observations

### What Triggers the Issue
- [Trigger 1]
- [Trigger 2]
- [Trigger 3]

### What Prevents the Issue
- [Prevention 1]
- [Prevention 2]

### Minimal Reproduction
[Simplest possible way to reproduce]

## Test Data Files

### File 1: [filename]
**Location**: [path]
**Purpose**: [what this file is for]
**Contents**: [brief description]

### File 2: [filename]
**Location**: [path]
**Purpose**: [what this file is for]
**Contents**: [brief description]

## Troubleshooting Reproduction

### If Reproduction Fails
1. [Check 1]
2. [Check 2]
3. [Check 3]

### Common Issues
- **Issue**: [problem with reproduction]
  **Solution**: [how to fix]

- **Issue**: [problem with reproduction]
  **Solution**: [how to fix]

## Next Steps

1. **Diagnosis**: Use `/debug diagnose` with reproduction steps
2. **Fix**: Use `/debug fix` once the root cause is identified
3. **Verification**: Re-run the reproduction after the fix to verify resolution

## Appendices

### A. Test Data
[Links to or contents of test data files]

### B. Environment Configuration
[Complete environment configuration needed]

### C. Video/Screenshots
[If applicable, links to recordings showing the issue]
```

## Error Handling

**Cannot Reproduce Locally**:
If the issue cannot be reproduced in a local environment:
1. Document what was tried
2. List environment differences
3. Suggest a production debugging approach
4. Create monitoring to capture more data (see the sketch below)
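
For step 4, a lightweight capture hook is often enough. A sketch of what that might look like, assuming an Express app and the upload failure from the running example:

```javascript
// Hypothetical Express middleware: record evidence for failures that only
// occur in production, so the reproduction can be refined later.
app.use((req, res, next) => {
  res.on('finish', () => {
    if (req.path.startsWith('/upload') && res.statusCode >= 400) {
      console.error('upload-failure', {
        status: res.statusCode,
        contentLength: req.headers['content-length'],
        userAgent: req.headers['user-agent'],
        timestamp: new Date().toISOString()
      });
    }
  });
  next();
});
```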

**Unreliable Reproduction**:
If the reproduction is intermittent:
1. Identify the factors affecting reliability
2. Add more constraints to increase reliability
3. Document the reliability percentage
4. Suggest a statistical testing approach (see the sketch below)
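
For step 4, "statistical testing" can be as simple as estimating the reproduction rate with an error bar. A sketch, assuming a hypothetical `attemptReproduction()` that resolves `true` when the bug fires:

```javascript
// Sketch: estimate the reproduction probability with a rough 95% interval.
async function estimateReliability(trials = 50) {
  let hits = 0;
  for (let i = 0; i < trials; i++) {
    if (await attemptReproduction()) hits++; // attemptReproduction() is hypothetical
  }
  const p = hits / trials;
  const margin = 1.96 * Math.sqrt((p * (1 - p)) / trials); // normal approximation
  console.log(`Reproduced ${hits}/${trials}: ${(p * 100).toFixed(0)}% ± ${(margin * 100).toFixed(0)}%`);
  return { p, margin };
}
```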

**Missing Prerequisites**:
If prerequisites are unavailable:
1. List what's missing
2. Suggest alternatives
3. Propose workaround strategies
4. Document assumptions

## Integration with Other Operations

- **Before**: Use `/debug diagnose` to understand the issue first
- **After**: Use `/debug fix` to implement the fix
- **Related**: Use `/debug analyze-logs` to gather more reproduction context

## Agent Utilization

This operation leverages the **10x-fullstack-engineer** agent for:
- Creating reliable reproduction strategies
- Designing comprehensive test cases
- Identifying edge cases and variations
- Documenting reproduction steps clearly
83
commands/debug/skill.md
Normal file
@@ -0,0 +1,83 @@
---
description: Comprehensive debugging toolkit for complex issues - diagnosis, reproduction, log analysis, performance, and memory debugging
argument-hint: <operation> [parameters...]
model: inherit
---

# Debug Skill - Advanced Debugging Operations

You are routing requests to specialized debugging operations. Parse `$ARGUMENTS` to determine which debugging operation to execute.

## Available Operations

- **diagnose** - Comprehensive diagnosis and root cause analysis across all stack layers
- **reproduce** - Create reliable reproduction strategies and test cases for issues
- **fix** - Implement targeted fixes with verification and prevention measures
- **analyze-logs** - Deep log analysis with pattern detection and timeline correlation
- **performance** - Performance debugging, profiling, and optimization
- **memory** - Memory leak detection, analysis, and optimization

## Routing Logic

Extract the first word from `$ARGUMENTS` as the operation name, and pass the remainder as operation parameters.

**Arguments received**: `$ARGUMENTS`

**Routing Instructions**:

1. **Parse the operation**: Extract the first word from `$ARGUMENTS` (see the sketch after this list)
2. **Load operation instructions**: Read the corresponding operation file from `.claude/commands/debug/`
3. **Execute with context**: Follow the operation's instructions with the remaining parameters
4. **Leverage agent**: All operations can leverage the 10x-fullstack-engineer agent for deep expertise
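
As a rough illustration only (the variable names below are hypothetical, not part of the command contract), the parse step amounts to splitting on the first space:

```bash
# Hypothetical sketch of the parse step
ARGUMENTS='diagnose issue:"500 errors on file upload" environment:"production"'
OPERATION="${ARGUMENTS%% *}"   # first word  -> "diagnose"
PARAMETERS="${ARGUMENTS#* }"   # remainder   -> operation parameters
# (if there is no space, PARAMETERS equals ARGUMENTS; a real parser should handle that)
echo "Route to .claude/commands/debug/${OPERATION}.md with: ${PARAMETERS}"
```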

## Operation Routing

```
diagnose      → Read and follow: .claude/commands/debug/diagnose.md
reproduce     → Read and follow: .claude/commands/debug/reproduce.md
fix           → Read and follow: .claude/commands/debug/fix.md
analyze-logs  → Read and follow: .claude/commands/debug/analyze-logs.md
performance   → Read and follow: .claude/commands/debug/performance.md
memory        → Read and follow: .claude/commands/debug/memory.md
```

## Base Directory

All operation files are located at: `.claude/commands/debug/`

## Error Handling

If no operation is specified or the operation is not recognized, respond with the following help text:

**Available debugging operations**:
- `/debug diagnose issue:"..." [environment:"..."] [logs:"..."]` - Comprehensive diagnosis
- `/debug reproduce issue:"..." [environment:"..."] [data:"..."]` - Create reproduction strategy
- `/debug fix issue:"..." root_cause:"..." [verification:"..."]` - Implement targeted fix
- `/debug analyze-logs path:"..." [pattern:"..."] [timeframe:"..."]` - Deep log analysis
- `/debug performance component:"..." [metric:"..."] [threshold:"..."]` - Performance debugging
- `/debug memory component:"..." [symptom:"..."] [duration:"..."]` - Memory debugging

**Example usage**:
```
/debug diagnose issue:"Users getting 500 errors on file upload" environment:"production" logs:"logs/app.log"
/debug reproduce issue:"Payment webhook fails intermittently" environment:"staging" data:"sample-webhook-payload.json"
/debug fix issue:"Race condition in order processing" root_cause:"Missing transaction lock" verification:"run-integration-tests"
/debug analyze-logs path:"logs/application.log" pattern:"ERROR.*timeout" timeframe:"last-24h"
/debug performance component:"api-endpoint:/orders" metric:"response-time" threshold:"200ms"
/debug memory component:"background-worker" symptom:"growing-heap" duration:"6h"
```

Please specify an operation and provide the necessary parameters.

## Integration with 10x-fullstack-engineer Agent

All debugging operations are designed to work seamlessly with the 10x-fullstack-engineer agent, which provides:
- Cross-stack debugging expertise
- Systematic root cause analysis
- Production-grade debugging strategies
- Performance and security awareness
- Prevention-focused mindset

## Execution

Based on the parsed operation from `$ARGUMENTS`, read the appropriate operation file and follow its instructions with the remaining parameters.