#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Proxmox Cluster Health Checker

Validates Proxmox cluster health including:
- Cluster quorum status
- Node membership and status
- Corosync ring health
- Resource manager status
- Configuration version sync

Usage:
    python check_cluster_health.py [--node NODE] [--json]

Examples:
    # Check cluster health (requires SSH access to cluster node)
    python check_cluster_health.py --node foxtrot

    # Output as JSON for parsing
    python check_cluster_health.py --node foxtrot --json
"""

import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional

# Patterns used to validate the SSH target. They are applied with
# ``fullmatch`` so a trailing newline (which ``$`` would tolerate) is rejected.
_HOSTNAME_RE = re.compile(
    r'[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?'
    r'(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*'
)
_IPV4_RE = re.compile(r'(\d{1,3}\.){3}\d{1,3}')
_IPV6_RE = re.compile(r'([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}')


@dataclass
class NodeStatus:
    """Cluster node status"""
    name: str
    online: bool
    node_id: int
    ip: str


@dataclass
class CorosyncStatus:
    """Corosync ring status"""
    ring_id: int
    nodes: List[str]
    status: str


@dataclass
class ClusterHealth:
    """Overall cluster health"""
    cluster_name: str
    quorate: bool
    node_count: int
    expected_votes: int
    total_votes: int
    nodes: List[NodeStatus]
    corosync_rings: List[CorosyncStatus]
    config_version: Optional[int]
    warnings: List[str]
    errors: List[str]

    @property
    def is_healthy(self) -> bool:
        """Return True when the cluster is quorate and no errors were recorded."""
        return self.quorate and len(self.errors) == 0


class ClusterHealthChecker:
    """Check Proxmox cluster health via SSH.

    Each ``check_*`` method runs one command on the target node and records
    its findings (parsed values, warnings, errors) on ``self.health``.
    """

    def __init__(self, node: str):
        """Create a checker for *node*.

        Raises:
            ValueError: if *node* is not a plausible hostname or IP address.
        """
        # Validate before the value is ever placed on an ssh command line.
        if not self._validate_node(node):
            raise ValueError(f"Invalid node name or IP address: {node}")

        self.node = node
        self.health = ClusterHealth(
            cluster_name="",
            quorate=False,
            node_count=0,
            expected_votes=0,
            total_votes=0,
            nodes=[],
            corosync_rings=[],
            config_version=None,
            warnings=[],
            errors=[],
        )

    def _validate_node(self, node: str) -> bool:
        """Return True if *node* looks like a hostname, IPv4 or IPv6 address."""
        if not isinstance(node, str):
            return False
        return any(
            pattern.fullmatch(node)
            for pattern in (_HOSTNAME_RE, _IPV4_RE, _IPV6_RE)
        )

    def run_command(self, command: str) -> str:
        """Execute *command* on the remote node via SSH and return its stdout.

        On timeout, non-zero exit status, or failure to launch ssh, an entry
        is appended to ``self.health.errors`` and an empty string is returned.
        """
        try:
            # "--" ends ssh option parsing before the remote command string.
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", f"root@{self.node}", "--", command],
                capture_output=True,
                text=True,
                check=True,
                timeout=30,
            )
        except subprocess.TimeoutExpired:
            self.health.errors.append(f"Command timed out: {command}")
            return ""
        except subprocess.CalledProcessError as e:
            self.health.errors.append(f"Command failed: {command}: {e.stderr}")
            return ""
        except OSError as e:
            # e.g. the ssh binary is missing; report instead of crashing.
            self.health.errors.append(f"Failed to run ssh for command: {command}: {e}")
            return ""
        return result.stdout

    def check_cluster_status(self):
        """Parse ``pvecm status``: cluster name, quorum, node count and votes."""
        output = self.run_command("pvecm status")
        if not output:
            self.health.errors.append("Failed to get cluster status")
            return

        # Accept both "Cluster name: X" and the "Name: X" form that pvecm
        # prints under its "Cluster information" heading.
        cluster_match = re.search(
            r'^\s*(?:Cluster name|Name):\s+(\S+)', output, re.MULTILINE
        )
        if cluster_match:
            self.health.cluster_name = cluster_match.group(1)

        quorum_match = re.search(r'Quorate:\s+(\w+)', output)
        if quorum_match:
            self.health.quorate = quorum_match.group(1).lower() == 'yes'
            if not self.health.quorate:
                self.health.errors.append("Cluster does not have quorum!")

        node_match = re.search(r'Nodes:\s+(\d+)', output)
        if node_match:
            self.health.node_count = int(node_match.group(1))

        expected_match = re.search(r'Expected votes:\s+(\d+)', output)
        if expected_match:
            self.health.expected_votes = int(expected_match.group(1))

        total_match = re.search(r'Total votes:\s+(\d+)', output)
        if total_match:
            self.health.total_votes = int(total_match.group(1))

        # A strict majority of the expected votes is required.
        if self.health.total_votes < (self.health.expected_votes // 2 + 1):
            self.health.errors.append(
                f"Insufficient votes: {self.health.total_votes}/{self.health.expected_votes}"
            )

    def check_nodes(self):
        """Parse ``pvecm nodes`` into ``self.health.nodes``.

        Rows are expected as ``<node id> <votes-or-hex-id> <name> [<ip>]``.
        A warning is recorded for rows that cannot be parsed and when the
        number of parsed nodes differs from ``self.health.node_count``.
        """
        output = self.run_command("pvecm nodes")
        if not output:
            self.health.warnings.append("Failed to get node list")
            return

        # The first line of the (stripped) output is a title/header line.
        for line in output.strip().split('\n')[1:]:
            parts = line.split()
            # Blank lines and dashed separators have too few columns.
            if len(parts) < 3:
                continue
            # "Nodeid Votes Name" is the column header, not a malformed row.
            if parts[0].lower() == "nodeid":
                continue

            try:
                node_id = int(parts[0])
            except ValueError as e:
                self.health.warnings.append(f"Failed to parse node line: {line}: {e}")
                continue

            # A fourth column such as "(local)" is a marker, not an address.
            has_ip = len(parts) >= 4 and not parts[3].startswith("(")
            self.health.nodes.append(NodeStatus(
                name=parts[2],
                online=True,  # If in list, assumed online
                node_id=node_id,
                ip=parts[3] if has_ip else "unknown",
            ))

        if len(self.health.nodes) != self.health.node_count:
            self.health.warnings.append(
                f"Node count mismatch: expected {self.health.node_count}, found {len(self.health.nodes)}"
            )

    def check_corosync(self):
        """Parse ``corosync-cfgtool -s`` ring status into ``corosync_rings``.

        Example output:
            Printing ring status.
            Local node ID 1
            RING ID 0
                    id      = 192.168.8.5
                    status  = ring 0 active with no faults

        Any ring whose status text lacks "no faults" is recorded as an error.
        """
        output = self.run_command("corosync-cfgtool -s")
        if not output:
            self.health.warnings.append("Failed to get corosync status")
            return

        current_ring = None
        for line in output.split('\n'):
            line = line.strip()

            if line.startswith('RING ID'):
                ring_match = re.search(r'RING ID (\d+)', line)
                if ring_match:
                    current_ring = int(ring_match.group(1))

            # Status lines only count once a "RING ID" header has been seen.
            elif 'status' in line.lower() and current_ring is not None:
                status_match = re.search(r'status\s*=\s*(.+)', line)
                if status_match:
                    status = status_match.group(1)

                    if 'no faults' not in status.lower():
                        self.health.errors.append(f"Corosync ring {current_ring}: {status}")

                    self.health.corosync_rings.append(CorosyncStatus(
                        ring_id=current_ring,
                        nodes=[],  # Could parse this if needed
                        status=status,
                    ))

    def check_config_version(self):
        """Read the corosync ``totem.config_version`` value.

        Accepts either a bare integer or the ``key (type) = value`` form;
        records a warning if no integer can be extracted.
        """
        output = self.run_command("corosync-cmapctl -b totem.config_version")
        if not output:
            return

        text = output.strip()
        try:
            self.health.config_version = int(text)
            return
        except ValueError:
            pass

        # Fall back to the value after "=" on a "key (type) = value" line.
        value_match = re.search(r'=\s*(\d+)\s*$', text, re.MULTILINE)
        if value_match:
            self.health.config_version = int(value_match.group(1))
        else:
            self.health.warnings.append("Failed to parse config version")

    def check_resource_manager(self):
        """Check the pve-cluster service and the cluster filesystem line."""
        output = self.run_command("systemctl is-active pve-cluster")
        if output.strip() != "active":
            self.health.errors.append("pve-cluster service is not active")

        # grep exits 1 when nothing matches; "|| true" keeps a missing
        # "cluster filesystem" line from being recorded as a command failure.
        output = self.run_command(
            "pvecm status | grep -i 'cluster filesystem' || true"
        )
        if output and 'online' not in output.lower():
            self.health.warnings.append("Cluster filesystem may not be online")

    def run_all_checks(self) -> ClusterHealth:
        """Run every check in order and return the accumulated health record."""
        self.check_cluster_status()
        self.check_nodes()
        self.check_corosync()
        self.check_config_version()
        self.check_resource_manager()
        return self.health


def main():
    """CLI entry point: run all checks and print a JSON or text report.

    In text mode the process exits 0 when healthy and 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Check Proxmox cluster health",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        '--node',
        default='foxtrot',
        help='Cluster node to check (default: foxtrot)',
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output as JSON',
    )

    args = parser.parse_args()

    checker = ClusterHealthChecker(args.node)
    health = checker.run_all_checks()

    if args.json:
        # asdict() skips properties, so add the computed verdict explicitly.
        payload = asdict(health)
        payload["is_healthy"] = health.is_healthy
        print(json.dumps(payload, indent=2))
    else:
        print(f"Cluster Health Check: {health.cluster_name}")
        print("=" * 60)
        print(f"Quorum Status: {'✓ YES' if health.quorate else '✗ NO'}")
        print(f"Nodes: {health.node_count} ({health.total_votes}/{health.expected_votes} votes)")

        # Compare against None so a version of 0 is still shown.
        if health.config_version is not None:
            print(f"Config Version: {health.config_version}")

        print("\nNodes:")
        for node in health.nodes:
            status = "✓" if node.online else "✗"
            print(f"  {status} {node.name} (ID: {node.node_id}, IP: {node.ip})")

        print("\nCorosync Rings:")
        for ring in health.corosync_rings:
            print(f"  Ring {ring.ring_id}: {ring.status}")

        if health.warnings:
            print("\nWarnings:")
            for warning in health.warnings:
                print(f"  ⚠ {warning}")

        if health.errors:
            print("\nErrors:")
            for error in health.errors:
                print(f"  ✗ {error}")

        print("\n" + "=" * 60)
        if health.is_healthy:
            print("Status: ✓ HEALTHY")
            sys.exit(0)
        else:
            print("Status: ✗ UNHEALTHY")
            sys.exit(1)


if __name__ == '__main__':
    main()