Files
gh-greyhaven-ai-claude-code…/skills/devops-troubleshooting/reference/diagnostic-commands.md
2025-11-29 18:29:23 +08:00

8.9 KiB

Diagnostic Commands Reference

Quick command reference for Grey Haven infrastructure troubleshooting. Copy-paste ready commands for rapid diagnosis.

Cloudflare Workers Commands

Deployment Management

# List recent deployments
wrangler deployments list

# View specific deployment
wrangler deployments view <deployment-id>

# Rollback to previous version
wrangler rollback --message "Reverting due to errors"

# Deploy to production
wrangler deploy

# Deploy to staging
wrangler deploy --env staging

Logs and Monitoring

# Real-time logs (pretty format)
wrangler tail --format pretty

# JSON logs for parsing
wrangler tail --format json

# Filter by status code
wrangler tail --format json | grep "\"status\":500"

# Show only errors
wrangler tail --format json | grep -i "error"

# Save logs to file
wrangler tail --format json > worker-logs.json

# Monitor specific worker
wrangler tail --name my-worker

Local Development

# Start local dev server
wrangler dev

# Dev with specific port
wrangler dev --port 8788

# Dev with remote mode (use production bindings)
wrangler dev --remote

# Test locally
curl http://localhost:8787/api/health

Configuration

# Show account info
wrangler whoami

# List KV namespaces
wrangler kv:namespace list

# List secrets
wrangler secret list

# Add secret
wrangler secret put API_KEY

# Delete secret
wrangler secret delete API_KEY

PlanetScale Commands

Database Management

# Connect to database shell
pscale shell greyhaven-db main

# Connect and execute query
pscale shell greyhaven-db main --execute "SELECT COUNT(*) FROM users"

# Show database info
pscale database show greyhaven-db

# List all databases
pscale database list

# Create new branch
pscale branch create greyhaven-db feature-branch

# List branches
pscale branch list greyhaven-db

Connection Monitoring

-- Active connections
SELECT COUNT(*) as active_connections
FROM pg_stat_activity
WHERE state = 'active';

-- Long-running queries
SELECT
  pid,
  now() - query_start as duration,
  query
FROM pg_stat_activity
WHERE state = 'active'
  AND query_start < now() - interval '10 seconds'
ORDER BY duration DESC;

-- Connection by state
SELECT state, COUNT(*)
FROM pg_stat_activity
GROUP BY state;

-- Blocked queries
SELECT
  blocked.pid AS blocked_pid,
  blocking.pid AS blocking_pid,
  blocked.query AS blocked_query
FROM pg_stat_activity blocked
JOIN pg_stat_activity blocking
  ON blocking.pid = ANY(pg_blocking_pids(blocked.pid));

Performance Analysis

# Slow query insights
pscale database insights greyhaven-db main --slow-queries

# Database size
pscale database show greyhaven-db --web

# Enable slow query log
pscale database settings update greyhaven-db --enable-slow-query-log
-- Table sizes
SELECT
  schemaname,
  tablename,
  pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Index usage
SELECT
  schemaname,
  tablename,
  indexname,
  idx_scan as index_scans
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;

-- Cache hit ratio
SELECT
  'cache hit rate' AS metric,
  sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) AS ratio
FROM pg_statio_user_tables;

Schema Migrations

# Create deploy request
pscale deploy-request create greyhaven-db <branch-name>

# List deploy requests
pscale deploy-request list greyhaven-db

# View deploy request diff
pscale deploy-request diff greyhaven-db <number>

# Deploy schema changes
pscale deploy-request deploy greyhaven-db <number>

# Close deploy request
pscale deploy-request close greyhaven-db <number>

Network Diagnostic Commands

DNS Resolution

# Basic DNS lookup
nslookup api.partner.com

# Detailed DNS query
dig api.partner.com

# Measure DNS time
time nslookup api.partner.com

# Check DNS propagation
dig api.partner.com @8.8.8.8
dig api.partner.com @1.1.1.1

# Reverse DNS lookup
dig -x 203.0.113.42

Connectivity Testing

# Ping test
ping -c 10 api.partner.com

# Trace network route
traceroute api.partner.com

# TCP connection test
nc -zv api.partner.com 443

# Test specific port
telnet api.partner.com 443

HTTP Request Timing

# Full timing breakdown
curl -w "\nDNS Lookup:    %{time_namelookup}s\nTCP Connect:   %{time_connect}s\nTLS Handshake: %{time_appconnect}s\nStart Transfer:%{time_starttransfer}s\nTotal:         %{time_total}s\n" \
  -o /dev/null -s https://api.partner.com/data

# Test with specific method
curl -X POST https://api.example.com/api \
  -H "Content-Type: application/json" \
  -d '{"test": "data"}'

# Follow redirects
curl -L https://example.com

# Show response headers
curl -I https://api.example.com

# Test CORS
curl -I -X OPTIONS https://api.example.com \
  -H "Origin: https://app.example.com" \
  -H "Access-Control-Request-Method: POST"

SSL/TLS Verification

# Check SSL certificate
openssl s_client -connect api.example.com:443

# Show certificate expiry
echo | openssl s_client -connect api.example.com:443 2>/dev/null | \
  openssl x509 -noout -dates

# Verify certificate chain
openssl s_client -connect api.example.com:443 -showcerts

Application Performance Commands

Resource Monitoring

# CPU usage
top -o cpu

# Memory usage
free -h  # Linux
vm_stat  # macOS

# Disk usage
df -h

# Process list
ps aux | grep node

# Port usage
lsof -i :8000
netstat -an | grep 8000

Log Analysis

# Tail logs
tail -f /var/log/app.log

# Search logs
grep -i "error" /var/log/app.log

# Count errors
grep -c "ERROR" /var/log/app.log

# Show recent errors with context
grep -B 5 -A 5 "error" /var/log/app.log

# Parse JSON logs
cat app.log | jq 'select(.level=="error")'

# Error frequency
grep "ERROR" /var/log/app.log | cut -d' ' -f1 | uniq -c

Worker Performance

# Monitor CPU time
wrangler tail --format json | jq '.outcome.cpuTime'

# Monitor duration
wrangler tail --format json | jq '.outcome.duration'

# Requests per second
wrangler tail --format json | wc -l

# Average response time
wrangler tail --format json | \
  jq -r '.outcome.duration' | \
  awk '{sum+=$1; count++} END {print sum/count}'

Health Check Scripts

Worker Health Check

#!/bin/bash
# health-check-worker.sh

echo "=== Worker Health Check ==="

# Test endpoint
STATUS=$(curl -s -o /dev/null -w "%{http_code}" https://api.greyhaven.io/health)

if [ "$STATUS" -eq 200 ]; then
  echo "✅ Worker responding (HTTP $STATUS)"
else
  echo "❌ Worker error (HTTP $STATUS)"
  exit 1
fi

# Check response time
TIME=$(curl -w "%{time_total}" -o /dev/null -s https://api.greyhaven.io/health)
echo "Response time: ${TIME}s"

if (( $(echo "$TIME > 1.0" | bc -l) )); then
  echo "⚠️  Slow response (>${TIME}s)"
fi

Database Health Check

#!/bin/bash
# health-check-db.sh

echo "=== Database Health Check ==="

# Test connection
pscale shell greyhaven-db main --execute "SELECT 1" > /dev/null 2>&1

if [ $? -eq 0 ]; then
  echo "✅ Database connection OK"
else
  echo "❌ Database connection failed"
  exit 1
fi

# Check active connections
ACTIVE=$(pscale shell greyhaven-db main --execute \
  "SELECT COUNT(*) FROM pg_stat_activity WHERE state='active'" | tail -1)

echo "Active connections: $ACTIVE"

if [ "$ACTIVE" -gt 80 ]; then
  echo "⚠️  High connection count (>80)"
fi

Complete System Health

#!/bin/bash
# health-check-all.sh

echo "=== Complete System Health Check ==="

# Worker
echo "\n1. Cloudflare Worker"
./health-check-worker.sh

# Database
echo "\n2. PlanetScale Database"
./health-check-db.sh

# External APIs
echo "\n3. External Dependencies"
for API in "https://api.partner1.com/health" "https://api.partner2.com/health"; do
  STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$API")
  if [ "$STATUS" -eq 200 ]; then
    echo "✅ $API (HTTP $STATUS)"
  else
    echo "❌ $API (HTTP $STATUS)"
  fi
done

echo "\n=== Health Check Complete ==="

Troubleshooting One-Liners

# Find memory hogs
ps aux --sort=-%mem | head -10

# Find CPU hogs
ps aux --sort=-%cpu | head -10

# Disk space by directory
du -sh /* | sort -h

# Network connections
netstat -ant | awk '{print $6}' | sort | uniq -c

# Failed login attempts
grep "Failed password" /var/log/auth.log | wc -l

# Top error codes
awk '{print $9}' access.log | sort | uniq -c | sort -rn

# Requests per minute
awk '{print $4}' access.log | cut -d: -f1-2 | uniq -c

# Average response size
awk '{sum+=$10; count++} END {print sum/count}' access.log


Return to reference index