From 411f0d306378309c9a1a39697b7ff7ddb69f0350 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:45:51 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 11 + README.md | 3 + commands/analyze-performance.md | 602 ++++++++++++++++++++++++++++++++ commands/health-check.md | 460 ++++++++++++++++++++++++ plugin.lock.json | 49 +++ 5 files changed, 1125 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 commands/analyze-performance.md create mode 100644 commands/health-check.md create mode 100644 plugin.lock.json diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..6dd2af3 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "etcd", + "description": "Etcd cluster health monitoring and performance analysis utilities", + "version": "0.0.1", + "author": { + "name": "github.com/openshift-eng" + }, + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ca8bb68 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# etcd + +Etcd cluster health monitoring and performance analysis utilities diff --git a/commands/analyze-performance.md b/commands/analyze-performance.md new file mode 100644 index 0000000..58d523a --- /dev/null +++ b/commands/analyze-performance.md @@ -0,0 +1,602 @@ +--- +description: Analyze etcd performance metrics, latency, and identify bottlenecks +argument-hint: "[--duration MINUTES]" +--- + +## Name +etcd:analyze-performance + +## Synopsis +``` +/etcd:analyze-performance [--duration MINUTES] +``` + +## Description + +The `analyze-performance` command analyzes etcd performance metrics to identify latency issues, slow operations, and potential bottlenecks. It examines disk performance, commit latency, network latency, and provides recommendations for optimization. + +Etcd performance is critical for cluster responsiveness. 
Slow etcd operations can cause: +- API server timeouts +- Slow pod creation and updates +- Controller delays +- Overall cluster sluggishness + +This command is useful for: +- Diagnosing slow cluster operations +- Identifying disk I/O bottlenecks +- Detecting network latency issues +- Capacity planning +- Performance tuning + +## Prerequisites + +Before using this command, ensure you have: + +1. **OpenShift CLI (oc)** + - Install from: https://mirror.openshift.com/pub/openshift-v4/clients/ocp/ + - Verify with: `oc version` + +2. **Active cluster connection** + - Must be connected to an OpenShift cluster + - Verify with: `oc whoami` + +3. **Cluster admin permissions** + - Required to access etcd pods and metrics + - Verify with: `oc auth can-i get pods -n openshift-etcd` + +4. **Running etcd pods** + - At least one etcd pod must be running + - Check with: `oc get pods -n openshift-etcd -l app=etcd` + +## Arguments + +- **--duration** (optional): Duration in minutes to analyze logs (default: 5) + - Analyzes recent logs for the specified duration + - Longer durations provide more comprehensive analysis + - Example: `--duration 15` for 15-minute window + +## Implementation + +The command performs the following analysis: + +### 1. Verify Prerequisites + +```bash +if ! command -v oc &> /dev/null; then + echo "Error: oc CLI not found" + exit 1 +fi + +if ! oc whoami &> /dev/null; then + echo "Error: Not connected to cluster" + exit 1 +fi + +# Parse duration argument (default: 5 minutes) +DURATION=5 +if [[ "$1" == "--duration" ]] && [[ -n "$2" ]]; then + DURATION=$2 +fi + +echo "Analyzing etcd performance (last $DURATION minutes)..." +``` + +### 2. Get Running Etcd Pod + +```bash +ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') + +if [ -z "$ETCD_POD" ]; then + echo "Error: No running etcd pod found" + exit 1 +fi + +echo "Using etcd pod: $ETCD_POD" +echo "" +``` + +### 3. 
Analyze Database Performance + +Get database statistics using etcdctl: + +```bash +echo "===============================================" +echo "DATABASE PERFORMANCE ANALYSIS" +echo "===============================================" +echo "" +echo "Fetching database statistics..." + +# Get database sizes from endpoint status for every member (--cluster); stderr is discarded, so on failure $DB_STATUS is empty. NOTE(review): the "Keys" field printed below is .Status.header.revision, which looks like the store revision rather than a key count - confirm +DB_STATUS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w json 2>/dev/null) + +echo "Database Statistics:" +echo "$DB_STATUS" | jq -r '.[] | + "Endpoint: \(.Endpoint) + Version: \(.Status.version) + DB Size: \(.Status.dbSize) bytes (\((.Status.dbSize / 1024 / 1024) | floor)MB) + DB In Use: \(.Status.dbSizeInUse) bytes (\((.Status.dbSizeInUse / 1024 / 1024) | floor)MB) + Keys: \(.Status.header.revision) + Raft Index: \(.Status.raftIndex) + Raft Term: \(.Status.raftTerm) + Leader: \(if .Status.leader == .Status.header.member_id then "YES" else "NO" end) +"' + +echo "" +echo "Fragmentation Analysis:" +echo "$DB_STATUS" | jq -r '.[] | + if .Status.dbSize > 0 then + ((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) as $frag | + "Endpoint: \(.Endpoint) + Fragmentation: \($frag | floor)%" + + if $frag > 50 then + " - WARNING: High fragmentation detected, consider defragmentation" + elif $frag > 30 then + " - NOTICE: Moderate fragmentation" + else + " - OK" + end + else + "Endpoint: \(.Endpoint) + Fragmentation: N/A" + end' +``` + +### 4. Check Cluster Health + +Verify etcd cluster health: + +```bash +echo "" +echo "===============================================" +echo "CLUSTER HEALTH" +echo "===============================================" +echo "" +oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster 2>/dev/null || echo "Health check failed" +``` + +### 5. 
Analyze Logs for Performance Issues + +Parse etcd logs for performance warnings: + +```bash +echo "" +echo "===============================================" +echo "LOG ANALYSIS (Last $DURATION minutes)" +echo "===============================================" +echo "" +echo "Searching for performance-related warnings..." + +# Get recent logs from the single pod in $ETCD_POD only; stderr is discarded, so if `oc logs` fails $LOGS is empty and every count below reads 0 +LOGS=$(oc logs -n openshift-etcd "$ETCD_POD" -c etcd --since="${DURATION}m" 2>/dev/null) + +# Count slow operations (any log line containing "slow", case-insensitive) +SLOW_OPS=$(echo "$LOGS" | grep -i "slow" | wc -l) +echo "Slow operations logged: $SLOW_OPS" + +if [ "$SLOW_OPS" -gt 0 ]; then + echo "" + echo "Recent slow operations (last 10):" + echo "$LOGS" | grep -i "slow" | tail -10 +fi + +echo "" + +# Check for disk warnings (lines mentioning disk/fdatasync/fsync that also mention slow/took/latency) +DISK_WARNINGS=$(echo "$LOGS" | grep -iE "disk|fdatasync|fsync" | grep -iE "slow|took|latency" | wc -l) +echo "Disk-related warnings: $DISK_WARNINGS" + +if [ "$DISK_WARNINGS" -gt 0 ]; then + echo "" + echo "Disk performance warnings:" + echo "$LOGS" | grep -iE "disk|fdatasync|fsync" | grep -iE "slow|took|latency" | tail -5 +fi + +echo "" + +# Check for apply warnings (lines matching "apply...took" or "slow...apply") +APPLY_WARNINGS=$(echo "$LOGS" | grep -iE "apply.*took|slow.*apply" | wc -l) +echo "Apply operation warnings: $APPLY_WARNINGS" + +if [ "$APPLY_WARNINGS" -gt 0 ]; then + echo "" + echo "Apply warnings:" + echo "$LOGS" | grep -iE "apply.*took|slow.*apply" | tail -5 +fi + +echo "" + +# Check for compaction info (last 3 "finished scheduled compaction" lines; informational only, not counted as a warning) +echo "Recent compaction operations:" +echo "$LOGS" | grep "finished scheduled compaction" | tail -3 +if [ $(echo "$LOGS" | grep "finished scheduled compaction" | wc -l) -eq 0 ]; then + echo " No compaction operations in this time window" +fi + +echo "" + +# Check for snapshot operations (any line containing "snapshot", case-insensitive; informational only) +echo "Snapshot operations:" +SNAPSHOTS=$(echo "$LOGS" | grep -i "snapshot" | wc -l) +echo "Snapshot events: $SNAPSHOTS" +if [ "$SNAPSHOTS" -gt 0 ]; then + echo "$LOGS" | grep -i "snapshot" | tail -3 +fi +``` + +### 6. 
Analyze Leader Stability + +Check for leader changes and stability issues: + +```bash +echo "" +echo "===============================================" +echo "LEADER STABILITY ANALYSIS" +echo "===============================================" +echo "" + +LEADER_CHANGES=$(echo "$LOGS" | grep -i "leader.*changed\|became leader\|lost leader" | wc -l) +echo "Leader change events: $LEADER_CHANGES" + +if [ "$LEADER_CHANGES" -gt 0 ]; then + echo "" + echo "Leader change events:" + echo "$LOGS" | grep -i "leader.*changed\|became leader\|lost leader" +fi + +# Check for proposal/commit issues (reuses $LOGS, so only lines from the one pod and time window fetched earlier are considered) +echo "" +echo "Proposal and commit operations:" +PROPOSAL_LOGS=$(echo "$LOGS" | grep -iE "proposal|commit" | grep -iE "slow|took|failed" | wc -l) +echo "Slow proposal/commit operations: $PROPOSAL_LOGS" + +if [ "$PROPOSAL_LOGS" -gt 0 ]; then + echo "" + echo "Sample slow operations:" + echo "$LOGS" | grep -iE "proposal|commit" | grep -iE "slow|took|failed" | tail -5 +fi +``` + +### 7. Analyze Network Performance + +Check for network-related issues: + +```bash +echo "" +echo "===============================================" +echo "NETWORK ANALYSIS" +echo "===============================================" +echo "" + +NETWORK_ISSUES=$(echo "$LOGS" | grep -iE "network|connection|timeout|peer" | grep -iE "error|fail|slow" | wc -l) +echo "Network-related issues: $NETWORK_ISSUES" + +if [ "$NETWORK_ISSUES" -gt 0 ]; then + echo "" + echo "Network issues:" + echo "$LOGS" | grep -iE "network|connection|timeout|peer" | grep -iE "error|fail|slow" | tail -5 +fi +``` + +### 8. 
Generate Performance Summary + +Create summary with recommendations: + +```bash +echo "" +echo "===============================================" +echo "PERFORMANCE SUMMARY & RECOMMENDATIONS" +echo "===============================================" +echo "" + +ISSUES=0 +WARNINGS=0 + +# Check fragmentation from DB status (worst member; jq returns a float, so the thresholds use the full value via bc while messages print it with the fraction stripped) +MAX_FRAG=$(echo "$DB_STATUS" | jq -r '[.[] | if .Status.dbSize > 0 then ((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) else 0 end] | max') + +if (( $(echo "$MAX_FRAG > 50" | bc -l 2>/dev/null || echo 0) )); then + echo "ISSUE: High database fragmentation (${MAX_FRAG%.*}%)" + echo " Recommendation: Run defragmentation on all etcd members" + echo " Command: oc exec -n openshift-etcd $ETCD_POD -c etcdctl -- etcdctl defrag" + echo "" + ISSUES=$((ISSUES + 1)) +elif (( $(echo "$MAX_FRAG > 30" | bc -l 2>/dev/null || echo 0) )); then + echo "WARNING: Moderate database fragmentation (${MAX_FRAG%.*}%)" + echo " Recommendation: Monitor and consider defragmentation if performance degrades" + echo "" + WARNINGS=$((WARNINGS + 1)) +fi + +if [ "$LEADER_CHANGES" -gt 5 ]; then + echo "WARNING: Frequent leader changes ($LEADER_CHANGES in last ${DURATION}m)" + echo " Recommendation: Check network stability between etcd nodes" + echo " - Verify network latency between control plane nodes" + echo " - Check for packet loss or network congestion" + echo "" + WARNINGS=$((WARNINGS + 1)) +fi + +if [ "$SLOW_OPS" -gt 10 ]; then + echo "WARNING: High number of slow operations ($SLOW_OPS in last ${DURATION}m)" + echo " Recommendation: Investigate disk I/O and workload patterns" + echo " - Check disk performance with 'fio' benchmarks" + echo " - Review etcd workload and consider optimization" + echo "" + WARNINGS=$((WARNINGS + 1)) +fi + +if [ "$DISK_WARNINGS" -gt 5 ]; then + echo "WARNING: Multiple disk performance warnings ($DISK_WARNINGS in last ${DURATION}m)" + echo " Recommendation: Investigate disk I/O performance" + echo " - Ensure etcd is using SSD/NVMe storage" + echo " - 
Check for disk saturation or competing I/O" + echo " - Verify disk benchmarks meet etcd requirements (> 50 sequential IOPS)" + echo "" + WARNINGS=$((WARNINGS + 1)) +fi + +# Get average DB size across members, converted to whole MB with bc +AVG_DB_SIZE=$(echo "$DB_STATUS" | jq -r '[.[] | .Status.dbSize] | add / length') +AVG_DB_SIZE_MB=$(echo "scale=0; $AVG_DB_SIZE / 1024 / 1024" | bc) + +if [ "$AVG_DB_SIZE_MB" -gt 8000 ]; then + echo "WARNING: Large database size (${AVG_DB_SIZE_MB}MB)" + echo " Recommendation: Review data retention and compaction policies" + echo " - Check event retention policies" + echo " - Consider more frequent compaction" + echo "" + WARNINGS=$((WARNINGS + 1)) +fi + +echo "Performance Metrics Summary:" +echo " - Database size: ${AVG_DB_SIZE_MB}MB (recommended: < 8GB)" +echo " - Fragmentation: ${MAX_FRAG%.*}% (recommended: < 30%)" +echo " - Slow operations (${DURATION}m): $SLOW_OPS (recommended: < 10)" +echo " - Leader changes (${DURATION}m): $LEADER_CHANGES (recommended: < 5)" +echo "" + +if [ "$ISSUES" -eq 0 ] && [ "$WARNINGS" -eq 0 ]; then + echo "Status: ✓ HEALTHY - Performance within acceptable ranges" + exit 0 +elif [ "$ISSUES" -gt 0 ]; then + echo "Status: ✗ CRITICAL - Found $ISSUES performance issues requiring attention" + exit 1 +else + echo "Status: ⚠ WARNING - Found $WARNINGS performance warnings" + exit 0 +fi +``` + +## Return Value + +- **Exit 0**: Performance is acceptable (may have warnings) +- **Exit 1**: Critical performance issues detected + +**Output Format**: +- Structured sections for different performance aspects +- Per-endpoint database size, fragmentation, and Raft status +- Warnings for values exceeding thresholds +- Recommendations for remediation + +## Examples + +### Example 1: Basic performance analysis +``` +/etcd:analyze-performance +``` + +Output: +``` +=============================================== +ETCD PERFORMANCE ANALYSIS +=============================================== +Analyzing etcd performance (last 5 minutes)... 
+Using etcd pod: etcd-dis016-p6vvv-master-0.us-central1-a.c.openshift-qe.internal + +=============================================== +DATABASE PERFORMANCE ANALYSIS +=============================================== + +Fetching database statistics... +Database Statistics: +Endpoint: https://10.0.0.5:2379 + Version: 3.5.24 + DB Size: 94941184 bytes (90MB) + DB In Use: 51789824 bytes (49MB) + Keys: 50240 + Raft Index: 57097 + Raft Term: 8 + Leader: YES + +Endpoint: https://10.0.0.3:2379 + Version: 3.5.24 + DB Size: 95363072 bytes (90MB) + DB In Use: 51789824 bytes (49MB) + Keys: 50240 + Raft Index: 57097 + Raft Term: 8 + Leader: NO + +Endpoint: https://10.0.0.6:2379 + Version: 3.5.24 + DB Size: 94613504 bytes (90MB) + DB In Use: 51834880 bytes (49MB) + Keys: 50240 + Raft Index: 57097 + Raft Term: 8 + Leader: NO + +Fragmentation Analysis: +Endpoint: https://10.0.0.5:2379 + Fragmentation: 45% - NOTICE: Moderate fragmentation +Endpoint: https://10.0.0.3:2379 + Fragmentation: 45% - NOTICE: Moderate fragmentation +Endpoint: https://10.0.0.6:2379 + Fragmentation: 45% - NOTICE: Moderate fragmentation + +=============================================== +CLUSTER HEALTH +=============================================== + +https://10.0.0.5:2379 is healthy: successfully committed proposal: took = 9.848973ms +https://10.0.0.3:2379 is healthy: successfully committed proposal: took = 14.309216ms +https://10.0.0.6:2379 is healthy: successfully committed proposal: took = 14.829731ms + +=============================================== +LOG ANALYSIS (Last 5 minutes) +=============================================== + +Searching for performance-related warnings... 
+Slow operations logged: 0 +Disk-related warnings: 0 +Apply operation warnings: 0 + +Recent compaction operations: +{"level":"info","ts":"2025-11-19T06:15:10.136401Z","caller":"mvcc/kvstore_compaction.go:72","msg":"finished scheduled compaction","compact-revision":48026,"took":"175.577699ms","hash":1330697744} + +=============================================== +LEADER STABILITY ANALYSIS +=============================================== + +Leader change events: 0 + +=============================================== +NETWORK ANALYSIS +=============================================== + +Network-related issues: 0 + +=============================================== +PERFORMANCE SUMMARY & RECOMMENDATIONS +=============================================== + +WARNING: Moderate database fragmentation (45%) + Recommendation: Monitor and consider defragmentation if performance degrades + +Performance Metrics Summary: + - Database size: 90MB (recommended: < 8GB) + - Fragmentation: 45% (recommended: < 30%) + - Slow operations (5m): 0 (recommended: < 10) + - Leader changes (5m): 0 (recommended: < 5) + +Status: ⚠ WARNING - Found 1 performance warnings +``` + +### Example 2: Extended analysis window +``` +/etcd:analyze-performance --duration 30 +``` + +## Common Performance Issues + +### High Database Fragmentation + +**Symptoms**: Database size significantly larger than in-use size (>30% fragmentation) + +**Investigation**: +```bash +# Check current fragmentation +oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w json | jq +``` + +**Remediation**: +```bash +# Defragment all etcd members (--cluster targets every member endpoint) +oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl defrag --cluster +``` + +**Recommendations**: +- Schedule regular defragmentation during maintenance windows +- Monitor fragmentation trends over time +- Consider defragmentation when >30% fragmented + +### Slow Disk I/O + +**Symptoms**: +- Disk-related warnings in logs (fsync, fdatasync) +- Slow apply 
operations +- High compaction times (>500ms) + +**Investigation**: +```bash +# Check disk performance on etcd nodes +oc debug node/"$NODE_NAME" -- chroot /host fio --name=test --rw=write --bs=4k --size=1G --direct=1 +``` + +**Recommendations**: +- Use SSD or NVMe storage for etcd +- Ensure dedicated disks for etcd (not shared with OS) +- Check for disk saturation or competing I/O +- Verify disk benchmarks meet etcd requirements (> 50 sequential IOPS) + +### Frequent Leader Changes + +**Symptoms**: Multiple leader change events in logs + +**Investigation**: +```bash +# Test network latency between control plane nodes +oc debug node/"$NODE_NAME" -- ping "$PEER_NODE_IP" + +# Check for network packet loss +oc debug node/"$NODE_NAME" -- ping -c 100 "$PEER_NODE_IP" +``` + +**Recommendations**: +- Ensure etcd nodes are in same datacenter/availability zone +- Check for network congestion or packet loss +- Verify MTU settings across cluster network +- Review network firewall rules and QoS settings + +### Large Database Size + +**Symptoms**: +- Database size >8GB +- Slow operations +- High memory usage + +**Investigation**: +```bash +# Check database size across cluster +oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w table +``` + +**Remediation**: +```bash +# Check event retention settings +oc get kubeapiserver cluster -o yaml | grep -A5 eventTTL + +# Review compaction settings +oc logs -n openshift-etcd "$ETCD_POD" -c etcd | grep compaction +``` + +**Recommendations**: +- Review event retention policies +- Consider more frequent compaction +- Check for key churn and unnecessary data +- Monitor database growth trends + +## Security Considerations + +- Metrics may expose cluster operational details +- Requires cluster-admin permissions +- Log analysis may contain sensitive data +- Performance data should be treated as confidential + +## See Also + +- Etcd performance guide: https://etcd.io/docs/latest/tuning/ +- OpenShift etcd docs: 
https://docs.openshift.com/container-platform/latest/scalability_and_performance/recommended-performance-scale-practices/ +- Related commands: `/etcd:health-check` + +## Notes + +- This command uses `etcdctl` and log analysis rather than direct metrics endpoint access +- Performance thresholds are based on etcd upstream recommendations +- Disk benchmarks should show > 50 sequential IOPS for etcd +- Network latency < 50ms recommended between members +- Analysis is point-in-time; trends require repeated checks over time +- Compatible with etcd 3.5+ (OpenShift 4.x) +- Log analysis window can be adjusted with `--duration` parameter +- For production clusters, consider running during low-traffic periods +- Health check latency is measured by actual proposal commits to the cluster diff --git a/commands/health-check.md b/commands/health-check.md new file mode 100644 index 0000000..cf52e1b --- /dev/null +++ b/commands/health-check.md @@ -0,0 +1,460 @@ +--- +description: Check etcd cluster health, member status, and identify issues +argument-hint: "[--verbose]" +--- + +## Name +etcd:health-check + +## Synopsis +``` +/etcd:health-check [--verbose] +``` + +## Description + +The `health-check` command performs a comprehensive health check of the etcd cluster in an OpenShift environment. It examines etcd member status, cluster health, leadership, connectivity, and identifies potential issues that could affect cluster stability. + +Etcd is the critical key-value store that holds all cluster state for Kubernetes/OpenShift. Issues related to etcd can cause cluster-wide failures, so monitoring its health is essential. + +This command is useful for: +- Diagnosing cluster control plane issues +- Verifying etcd cluster stability +- Identifying split-brain scenarios +- Checking member synchronization +- Detecting disk space issues +- Monitoring etcd performance + +## Prerequisites + +Before using this command, ensure you have: + +1. 
**OpenShift CLI (oc)** + - Install from: https://mirror.openshift.com/pub/openshift-v4/clients/ocp/ + - Verify with: `oc version` + +2. **Active cluster connection** + - Must be connected to an OpenShift cluster + - Verify with: `oc whoami` + +3. **Cluster admin permissions** + - Required to access etcd pods and execute commands + - Verify with: `oc auth can-i get pods -n openshift-etcd` + +4. **Healthy etcd namespace** + - The openshift-etcd namespace must exist + - At least one etcd pod must be running + +## Arguments + +- **--verbose** (optional): Enable detailed output + - Shows etcd member details + - Displays performance metrics + - Includes log snippets for errors + - Provides additional diagnostic information + +## Implementation + +The command performs the following checks: + +### 1. Verify Prerequisites + +Check if oc CLI is available and cluster is accessible: + +```bash +if ! command -v oc &> /dev/null; then + echo "Error: oc CLI not found. Please install OpenShift CLI." + exit 1 +fi + +if ! oc whoami &> /dev/null; then + echo "Error: Not connected to an OpenShift cluster." + exit 1 +fi +``` + +### 2. Check Etcd Namespace and Pods + +Verify the etcd namespace exists and get pod status: + +```bash +echo "Checking etcd namespace and pods..." + +if ! 
oc get namespace openshift-etcd &> /dev/null; then + echo "CRITICAL: openshift-etcd namespace not found" + exit 1 +fi + +# Get etcd pod status (total pods with label app=etcd vs. those in phase Running) +ETCD_PODS=$(oc get pods -n openshift-etcd -l app=etcd -o json) +TOTAL_PODS=$(echo "$ETCD_PODS" | jq '.items | length') +RUNNING_PODS=$(echo "$ETCD_PODS" | jq '[.items[] | select(.status.phase == "Running")] | length') + +echo "Etcd pods: $RUNNING_PODS/$TOTAL_PODS running" + +if [ "$RUNNING_PODS" -eq 0 ]; then + echo "CRITICAL: No etcd pods are running" + exit 1 +fi + +# List all etcd pods with status +echo "" +echo "Etcd Pod Status:" +oc get pods -n openshift-etcd -l app=etcd -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,READY:.status.containerStatuses[0].ready,RESTARTS:.status.containerStatuses[0].restartCount,NODE:.spec.nodeName +``` + +### 3. Check Etcd Cluster Health + +Use etcdctl to check cluster health from the first running etcd pod (`--cluster` covers every member endpoint): + +```bash +echo "" +echo "Checking etcd cluster health..." + +# Get the first running etcd pod +ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') + +if [ -z "$ETCD_POD" ]; then + echo "CRITICAL: No running etcd pod found" + exit 1 +fi + +# Check cluster health; the table output reports health as a true/false cell per endpoint, so accept a "| true |" cell as well as the plain-text "is healthy" form (passes when at least one endpoint is healthy) +HEALTH_OUTPUT=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster -w table 2>&1) + +if echo "$HEALTH_OUTPUT" | grep -qE 'is healthy|\|[[:space:]]*true[[:space:]]*\|'; then + echo "Cluster Health Status:" + echo "$HEALTH_OUTPUT" +else + echo "CRITICAL: Etcd cluster health check failed" + echo "$HEALTH_OUTPUT" + exit 1 +fi +``` + +### 4. Check Etcd Member List + +List all etcd members and verify quorum: + +```bash +echo "" +echo "Checking etcd member list..." 
+ +MEMBER_LIST=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w table 2>&1) + +echo "Etcd Members:" +echo "$MEMBER_LIST" + +# Count members +MEMBER_COUNT=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w json 2>/dev/null | jq '.members | length') + +echo "" +echo "Total members: $MEMBER_COUNT" + +if [ "$MEMBER_COUNT" -lt 3 ]; then + echo "WARNING: Etcd cluster has less than 3 members (quorum at risk)" +fi + +# Check for unstarted members +UNSTARTED=$(echo "$MEMBER_LIST" | grep "unstarted" | wc -l) +if [ "$UNSTARTED" -gt 0 ]; then + echo "WARNING: $UNSTARTED member(s) in unstarted state" +fi +``` + +### 5. Check Etcd Leadership + +Verify there is a healthy leader: + +```bash +echo "" +echo "Checking etcd leadership..." + +ENDPOINT_STATUS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w table 2>&1) + +echo "Endpoint Status:" +echo "$ENDPOINT_STATUS" + +# Check if there's a leader +if echo "$ENDPOINT_STATUS" | grep -q "true"; then + LEADER_ENDPOINT=$(echo "$ENDPOINT_STATUS" | grep "true" | awk '{print $2}') + echo "" + echo "Leader: $LEADER_ENDPOINT" +else + echo "CRITICAL: No etcd leader elected" + exit 1 +fi +``` + +### 6. Check Etcd Database Size + +Check database size and fragmentation: + +```bash +echo "" +echo "Checking etcd database size..." 
+ +# Get database size from endpoint status +DB_SIZE=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w json 2>/dev/null) + +echo "$DB_SIZE" | jq -r '.[] | "Endpoint: \(.Endpoint) | DB Size: \(.Status.dbSize) bytes | DB Size in Use: \(.Status.dbSizeInUse) bytes"' + +# Calculate fragmentation percentage +echo "$DB_SIZE" | jq -r '.[] | + if .Status.dbSize > 0 then + "Fragmentation: \(((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) | floor)%" + else + "Fragmentation: N/A" + end' + +# Warn if database is too large +MAX_DB_SIZE=$((8 * 1024 * 1024 * 1024)) # 8GB threshold +CURRENT_SIZE=$(echo "$DB_SIZE" | jq -r '.[0].Status.dbSize') + +if [ "$CURRENT_SIZE" -gt "$MAX_DB_SIZE" ]; then + echo "WARNING: Etcd database size ($CURRENT_SIZE bytes) exceeds recommended maximum (8GB)" + echo "Consider defragmentation or checking for excessive key growth" +fi +``` + +### 7. Check Disk Space on Etcd Nodes + +Verify disk space on nodes running etcd: + +```bash +echo "" +echo "Checking disk space on etcd nodes..." + +for pod in $(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}'); do + echo "Pod: $pod" + oc exec -n openshift-etcd "$pod" -c etcd -- df -h /var/lib/etcd | tail -1 + + # Get disk usage percentage + DISK_USAGE=$(oc exec -n openshift-etcd "$pod" -c etcd -- df -h /var/lib/etcd | tail -1 | awk '{print $5}' | sed 's/%//') + + if [ "$DISK_USAGE" -gt 80 ]; then + echo "WARNING: Disk usage on $pod is ${DISK_USAGE}% (threshold: 80%)" + fi + echo "" +done +``` + +### 8. Check for Recent Etcd Errors + +Check recent logs for errors or warnings: + +```bash +echo "" +echo "Checking recent etcd logs for errors..." 
+ +RECENT_ERRORS=$(oc logs -n openshift-etcd "$ETCD_POD" -c etcd --tail=100 | grep -i "error\|warn\|fatal" | tail -10) + +if [ -n "$RECENT_ERRORS" ]; then + echo "Recent errors/warnings found:" + echo "$RECENT_ERRORS" +else + echo "No recent errors in etcd logs" +fi +``` + +### 9. Check Etcd Performance Metrics (if --verbose) + +If verbose mode is enabled, check performance metrics: + +```bash +if [ "$VERBOSE" = "true" ]; then + echo "" + echo "Checking etcd performance metrics..." + + # Get metrics from etcd pod + METRICS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcd -- curl -s http://localhost:2379/metrics 2>/dev/null) + + # Parse key metrics + echo "Backend Commit Duration (p99):" + echo "$METRICS" | grep "etcd_disk_backend_commit_duration_seconds" | grep "quantile=\"0.99\"" | head -1 + + echo "" + echo "WAL Fsync Duration (p99):" + echo "$METRICS" | grep "etcd_disk_wal_fsync_duration_seconds" | grep "quantile=\"0.99\"" | head -1 + + echo "" + echo "Leader Changes:" + echo "$METRICS" | grep "etcd_server_leader_changes_seen_total" | head -1 +fi +``` + +### 10. 
Generate Summary Report + +Create a summary of findings: + +```bash +echo "" +echo "===============================================" +echo "Etcd Health Check Summary" +echo "===============================================" +echo "Check Time: $(date)" +echo "Cluster: $(oc whoami --show-server)" +echo "" +echo "Results:" +echo " Etcd Pods Running: $RUNNING_PODS/$TOTAL_PODS" +echo " Cluster Members: $MEMBER_COUNT" +echo " Leader Elected: Yes" +echo " Cluster Health: Healthy" +echo "" + +if [ "${WARNINGS:-0}" -gt 0 ]; then # default to 0 when no earlier step has set WARNINGS, instead of failing the integer test + echo "Status: WARNING - Found $WARNINGS warnings requiring attention" + exit 0 +else + echo "Status: HEALTHY - All checks passed" + exit 0 +fi +``` + +## Return Value + +The command returns different exit codes: + +- **Exit 0**: Etcd cluster is healthy (may have warnings) +- **Exit 1**: Critical issues detected (no running pods, no leader, health check failed) + +**Output Format**: +- Human-readable report with section headers +- Critical issues marked with "CRITICAL:" +- Warnings marked with "WARNING:" +- Success indicators for healthy checks + +## Examples + +### Example 1: Basic health check +``` +/etcd:health-check +``` + +Output: +``` +Checking etcd namespace and pods... +Etcd pods: 3/3 running + +Etcd Pod Status: +NAME STATUS READY RESTARTS NODE +etcd-ip-10-0-21-125.us-east-2... Running true 0 ip-10-0-21-125 +etcd-ip-10-0-43-249.us-east-2... Running true 0 ip-10-0-43-249 +etcd-ip-10-0-68-109.us-east-2... Running true 0 ip-10-0-68-109 + +Checking etcd cluster health... +Cluster Health Status: ++------------------------------------------+--------+ +| ENDPOINT | HEALTH | ++------------------------------------------+--------+ +| https://10.0.21.125:2379 | true | +| https://10.0.43.249:2379 | true | +| https://10.0.68.109:2379 | true | ++------------------------------------------+--------+ + +Checking etcd member list... 
+Etcd Members: ++------------------+---------+------------------------+ +| ID | STATUS | NAME | ++------------------+---------+------------------------+ +| 3a2b1c4d5e6f7890 | started | ip-10-0-21-125 | +| 4b3c2d5e6f708901 | started | ip-10-0-43-249 | +| 5c4d3e6f70890123 | started | ip-10-0-68-109 | ++------------------+---------+------------------------+ + +Total members: 3 + +Checking etcd leadership... +Leader: https://10.0.21.125:2379 + +=============================================== +Etcd Health Check Summary +=============================================== +Status: HEALTHY - All checks passed +``` + +### Example 2: Verbose health check with metrics +``` +/etcd:health-check --verbose +``` + +## Common Issues and Remediation + +### No Etcd Leader + +**Symptoms**: Cluster shows no leader elected + +**Investigation**: +```bash +oc logs -n openshift-etcd "$ETCD_POD" -c etcd | grep -i "leader" +oc get events -n openshift-etcd +``` + +**Remediation**: +- Check network connectivity between etcd members +- Verify etcd pods are running on different nodes +- Check for clock skew between nodes + +### High Database Size + +**Symptoms**: Database size exceeds 8GB + +**Investigation**: +```bash +oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status -w table +``` + +**Remediation**: +- Run defragmentation: `/etcd:defrag` (if command exists) +- Check for excessive key creation (e.g., many events) +- Review retention policies + +### Disk Space Issues + +**Symptoms**: Disk usage > 80% on etcd data directory + +**Investigation**: +```bash +oc exec -n openshift-etcd "$ETCD_POD" -c etcd -- df -h /var/lib/etcd +``` + +**Remediation**: +- Clean up old snapshots +- Defragment database +- Increase disk size if needed + +### Member Not Started + +**Symptoms**: Member shows "unstarted" status + +**Investigation**: +```bash +oc logs -n openshift-etcd "$ETCD_POD" -c etcd +oc describe pod "$ETCD_POD" -n openshift-etcd +``` + +**Remediation**: +- Check pod logs for errors +- Verify certificates are valid +- Check network 
policies and firewall rules + +## Security Considerations + +- Requires cluster-admin or equivalent permissions +- Access to etcd data allows viewing all cluster secrets +- Etcd metrics may contain sensitive information +- Always use secure connections when accessing etcd + +## See Also + +- Etcd documentation: https://etcd.io/docs/ +- OpenShift etcd docs: https://docs.openshift.com/container-platform/latest/backup_and_restore/control_plane_backup_and_restore/ +- Related commands: `/etcd:analyze-performance` + +## Notes + +- This command is read-only and does not modify etcd +- Checks are performed from within etcd pods using etcdctl +- Some checks require etcd to be running +- Performance may vary on large clusters with many keys +- Database size recommendations are based on upstream etcd guidance diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..656fc49 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,49 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:openshift-eng/ai-helpers:plugins/etcd", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "7a881888af5024df88b834e135f03c399e2a0d6b", + "treeHash": "5cfa8d7cda8d6cc845b58e28b05232ed3d430612a490fa0f639a925828f33e9f", + "generatedAt": "2025-11-28T10:27:30.314810Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "etcd", + "description": "Etcd cluster health monitoring and performance analysis utilities", + "version": "0.0.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "0a66e8e8db6638908533821d2e48f7d355a9c80710f9271272825146315e04cd" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": 
"214e2fa147b184331a42b251f7d50accc4300fac8f46d941db14c7a8a2e37353" + }, + { + "path": "commands/analyze-performance.md", + "sha256": "7530e6beaa860a8c13f878de23434e426e76f8a70c90ac0b5810e30867c4ccef" + }, + { + "path": "commands/health-check.md", + "sha256": "1f3a1f2615e3c397788b758ae5d500277e16c7e57ad06692376488e69c0a1630" + } + ], + "dirSha256": "5cfa8d7cda8d6cc845b58e28b05232ed3d430612a490fa0f639a925828f33e9f" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file