Initial commit
This commit is contained in:
223
skills/scripts/cluster_health.py
Executable file
223
skills/scripts/cluster_health.py
Executable file
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cluster Health Check Script
|
||||
Performs comprehensive cluster health diagnostics
|
||||
"""
|
||||
import json
|
||||
import subprocess
|
||||
from typing import Dict, List, Any
|
||||
from datetime import datetime
|
||||
|
||||
def run_kubectl(args: List[str]) -> Dict[str, Any]:
|
||||
"""Run kubectl command and return parsed JSON"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['kubectl'] + args,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True
|
||||
)
|
||||
return json.loads(result.stdout) if result.stdout else {}
|
||||
except subprocess.CalledProcessError as e:
|
||||
return {"error": e.stderr}
|
||||
except json.JSONDecodeError:
|
||||
return {"error": "Failed to parse kubectl output"}
|
||||
|
||||
def check_nodes() -> Dict[str, Any]:
|
||||
"""Check node health"""
|
||||
nodes = run_kubectl(['get', 'nodes', '-o', 'json'])
|
||||
if 'error' in nodes:
|
||||
return nodes
|
||||
|
||||
results = {
|
||||
"healthy": 0,
|
||||
"unhealthy": 0,
|
||||
"issues": []
|
||||
}
|
||||
|
||||
for node in nodes.get('items', []):
|
||||
name = node['metadata']['name']
|
||||
conditions = node.get('status', {}).get('conditions', [])
|
||||
|
||||
is_ready = False
|
||||
for condition in conditions:
|
||||
if condition['type'] == 'Ready':
|
||||
is_ready = condition['status'] == 'True'
|
||||
if not is_ready:
|
||||
results['unhealthy'] += 1
|
||||
results['issues'].append(f"Node {name} is not Ready")
|
||||
else:
|
||||
results['healthy'] += 1
|
||||
break
|
||||
|
||||
# Check other conditions
|
||||
for condition in conditions:
|
||||
if condition['type'] != 'Ready' and condition['status'] == 'True':
|
||||
results['issues'].append(f"Node {name}: {condition['type']} = {condition['status']}")
|
||||
|
||||
return results
|
||||
|
||||
def check_system_pods() -> Dict[str, Any]:
|
||||
"""Check critical system pods"""
|
||||
namespaces = ['kube-system', 'kube-public', 'kube-node-lease']
|
||||
results = {
|
||||
"healthy": 0,
|
||||
"unhealthy": 0,
|
||||
"issues": []
|
||||
}
|
||||
|
||||
for ns in namespaces:
|
||||
pods = run_kubectl(['get', 'pods', '-n', ns, '-o', 'json'])
|
||||
if 'error' in pods:
|
||||
continue
|
||||
|
||||
for pod in pods.get('items', []):
|
||||
name = pod['metadata']['name']
|
||||
phase = pod.get('status', {}).get('phase', 'Unknown')
|
||||
|
||||
if phase == 'Running':
|
||||
# Check if all containers are ready
|
||||
container_statuses = pod.get('status', {}).get('containerStatuses', [])
|
||||
all_ready = all(c.get('ready', False) for c in container_statuses)
|
||||
|
||||
if all_ready:
|
||||
results['healthy'] += 1
|
||||
else:
|
||||
results['unhealthy'] += 1
|
||||
results['issues'].append(f"Pod {ns}/{name}: Containers not ready")
|
||||
elif phase in ['Succeeded', 'Completed']:
|
||||
results['healthy'] += 1
|
||||
else:
|
||||
results['unhealthy'] += 1
|
||||
results['issues'].append(f"Pod {ns}/{name}: Phase is {phase}")
|
||||
|
||||
return results
|
||||
|
||||
def check_pending_pods() -> Dict[str, Any]:
|
||||
"""Check for pods stuck in pending"""
|
||||
all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
|
||||
if 'error' in all_pods:
|
||||
return all_pods
|
||||
|
||||
pending = []
|
||||
for pod in all_pods.get('items', []):
|
||||
if pod.get('status', {}).get('phase') == 'Pending':
|
||||
name = pod['metadata']['name']
|
||||
namespace = pod['metadata']['namespace']
|
||||
pending.append(f"{namespace}/{name}")
|
||||
|
||||
return {"count": len(pending), "pods": pending}
|
||||
|
||||
def check_failed_pods() -> Dict[str, Any]:
|
||||
"""Check for failed pods"""
|
||||
all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
|
||||
if 'error' in all_pods:
|
||||
return all_pods
|
||||
|
||||
failed = []
|
||||
for pod in all_pods.get('items', []):
|
||||
if pod.get('status', {}).get('phase') == 'Failed':
|
||||
name = pod['metadata']['name']
|
||||
namespace = pod['metadata']['namespace']
|
||||
failed.append(f"{namespace}/{name}")
|
||||
|
||||
return {"count": len(failed), "pods": failed}
|
||||
|
||||
def check_crashloop_pods() -> Dict[str, Any]:
|
||||
"""Check for pods in crash loop"""
|
||||
all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
|
||||
if 'error' in all_pods:
|
||||
return all_pods
|
||||
|
||||
crashloop = []
|
||||
for pod in all_pods.get('items', []):
|
||||
container_statuses = pod.get('status', {}).get('containerStatuses', [])
|
||||
for container in container_statuses:
|
||||
state = container.get('state', {})
|
||||
if 'waiting' in state and 'CrashLoopBackOff' in state['waiting'].get('reason', ''):
|
||||
name = pod['metadata']['name']
|
||||
namespace = pod['metadata']['namespace']
|
||||
container_name = container['name']
|
||||
crashloop.append(f"{namespace}/{name} (container: {container_name})")
|
||||
break
|
||||
|
||||
return {"count": len(crashloop), "pods": crashloop}
|
||||
|
||||
def main():
|
||||
print("🏥 Kubernetes Cluster Health Check")
|
||||
print("=" * 60)
|
||||
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||||
|
||||
# Check nodes
|
||||
print("🖥️ Node Health:")
|
||||
nodes = check_nodes()
|
||||
if 'error' not in nodes:
|
||||
print(f" ✅ Healthy nodes: {nodes['healthy']}")
|
||||
if nodes['unhealthy'] > 0:
|
||||
print(f" ❌ Unhealthy nodes: {nodes['unhealthy']}")
|
||||
for issue in nodes['issues']:
|
||||
print(f" • {issue}")
|
||||
else:
|
||||
print(f" ❌ Error: {nodes['error']}")
|
||||
print()
|
||||
|
||||
# Check system pods
|
||||
print("🔧 System Pods:")
|
||||
system = check_system_pods()
|
||||
if 'error' not in system:
|
||||
print(f" ✅ Healthy: {system['healthy']}")
|
||||
if system['unhealthy'] > 0:
|
||||
print(f" ⚠️ Unhealthy: {system['unhealthy']}")
|
||||
for issue in system['issues'][:10]: # Show first 10
|
||||
print(f" • {issue}")
|
||||
else:
|
||||
print(f" ❌ Error: {system['error']}")
|
||||
print()
|
||||
|
||||
# Check pending pods
|
||||
print("⏳ Pending Pods:")
|
||||
pending = check_pending_pods()
|
||||
if 'error' not in pending:
|
||||
if pending['count'] == 0:
|
||||
print(" ✅ No pods stuck in pending")
|
||||
else:
|
||||
print(f" ⚠️ {pending['count']} pods in pending state:")
|
||||
for pod in pending['pods'][:10]:
|
||||
print(f" • {pod}")
|
||||
else:
|
||||
print(f" ❌ Error: {pending['error']}")
|
||||
print()
|
||||
|
||||
# Check failed pods
|
||||
print("💥 Failed Pods:")
|
||||
failed = check_failed_pods()
|
||||
if 'error' not in failed:
|
||||
if failed['count'] == 0:
|
||||
print(" ✅ No failed pods")
|
||||
else:
|
||||
print(f" ❌ {failed['count']} pods in failed state:")
|
||||
for pod in failed['pods'][:10]:
|
||||
print(f" • {pod}")
|
||||
else:
|
||||
print(f" ❌ Error: {failed['error']}")
|
||||
print()
|
||||
|
||||
# Check crash loops
|
||||
print("🔄 Crash Loop Pods:")
|
||||
crashloop = check_crashloop_pods()
|
||||
if 'error' not in crashloop:
|
||||
if crashloop['count'] == 0:
|
||||
print(" ✅ No pods in crash loop")
|
||||
else:
|
||||
print(f" ❌ {crashloop['count']} pods in crash loop:")
|
||||
for pod in crashloop['pods'][:10]:
|
||||
print(f" • {pod}")
|
||||
else:
|
||||
print(f" ❌ Error: {crashloop['error']}")
|
||||
print()
|
||||
|
||||
print("=" * 60)
|
||||
print("Health check complete!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user